diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml
index 72152f749a..6c721e3d54 100644
--- a/.github/workflows/osx.yml
+++ b/.github/workflows/osx.yml
@@ -2,12 +2,19 @@ name: github-OSX
 
 on:
   pull_request:
-    branches:
-      - master
-      - develop
+    types: [ opened, labeled, unlabeled, reopened, synchronize ]
 
 jobs:
+  check-pr-labels:
+    runs-on: [ubuntu-latest]
+    steps:
+      - uses: docker://agilepathway/pull-request-label-checker:latest
+        with:
+          none_of: 'AT: WIP'
+          repo_token: ${{ secrets.GITHUB_TOKEN }}
   osxci:
+    needs: check-pr-labels
+    # TODO: allow re-run via retest label if: ${{ github.event.label.name == 'AT: RETEST' }}
     name: osx-ci
     runs-on: [macos-latest]
 
@@ -16,12 +23,16 @@ jobs:
         include:
           - backend: "SERIAL"
             cmake_build_type: "RelWithDebInfo"
+            debug_bounds_check: "ON"
           - backend: "THREADS"
             cmake_build_type: "RelWithDebInfo"
+            debug_bounds_check: "ON"
           - backend: "SERIAL"
             cmake_build_type: "Debug"
+            debug_bounds_check: "OFF"
           - backend: "SERIAL"
             cmake_build_type: "Release"
+            debug_bounds_check: "ON"
 
     steps:
       - name: checkout_kokkos_kernels
@@ -46,6 +57,7 @@ jobs:
           -DCMAKE_CXX_FLAGS="-Werror" \
           -DCMAKE_CXX_STANDARD=14 \
           -DKokkos_ENABLE_COMPILER_WARNINGS=ON \
+          -DKokkos_ENABLE_DEBUG_BOUNDS_CHECK:BOOL=${{ matrix.debug_bounds_check }} \
           -DKokkos_ENABLE_DEPRECATED_CODE_3=OFF \
           -DCMAKE_BUILD_TYPE=${{ matrix.cmake_build_type }} \
           -DCMAKE_INSTALL_PREFIX=$PWD/../install \
@@ -73,6 +85,8 @@ jobs:
           -DKokkosKernels_INST_FLOAT=ON \
           -DKokkosKernels_INST_LAYOUTLEFT:BOOL=ON \
           -DKokkosKernels_INST_LAYOUTRIGHT:BOOL=ON \
+          -DKokkosKernels_INST_OFFSET_INT=ON \
+          -DKokkosKernels_INST_OFFSET_SIZE_T=ON \
           -DKokkosKernels_ENABLE_TPL_CUSPARSE=OFF \
           -DKokkosKernels_ENABLE_TPL_CUBLAS=OFF \
           ..
@@ -83,4 +97,4 @@ jobs:
 
       - name: test
         working-directory: kokkos-kernels/build
-        run: ctest -j2 --output-on-failure
\ No newline at end of file
+        run: ctest -j2 --output-on-failure --timeout 3600
\ No newline at end of file
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 97dce4835d..46c4eeaf5f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,119 @@
 # Change Log
 
+## [3.7.00](https://github.com/kokkos/kokkos-kernels/tree/3.7.00) (2022-08-18)
+[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.6.01...3.7.00)
+
+### Features:
+
+#### Final Bsr algorithms implemented for multigrid:
+- Sparse: bsr transpose algorithm [\#1477](https://github.com/kokkos/kokkos-kernels/pull/1477)
+- BSR block SpGEMM implementation [\#1099](https://github.com/kokkos/kokkos-kernels/pull/1099)
+
+#### Adding batched dense linear and non-linear system solvers:
+- Add batched GESV [\#1384](https://github.com/kokkos/kokkos-kernels/pull/1384)
+- Newton solver: serial on device implementation of Newton's method [\#1479](https://github.com/kokkos/kokkos-kernels/pull/1479)
+
+#### Add sparse matrix conversion:
+- Add csc2csr [\#1342](https://github.com/kokkos/kokkos-kernels/pull/1342)
+- csc2csr: update Kokkos_Numeric.hpp header inclusion [\#1449](https://github.com/kokkos/kokkos-kernels/pull/1449)
+- sparse: Remove csc2csr copy [\#1375](https://github.com/kokkos/kokkos-kernels/pull/1375)
+
+#### New documentation in readthedocs
+- Added https://kokkos-kernels.readthedocs.io [\#1451](https://github.com/kokkos/kokkos-kernels/pull/1451)
+- Restructure docs [\#1368](https://github.com/kokkos/kokkos-kernels/pull/1368)
+
+#### Fix issues with TPLs for mutlivector SPMV
+- Add cuSparse TPL files for CrsMatrix-multivector product [\#1427](https://github.com/kokkos/kokkos-kernels/pull/1427)
+
+### Deprecations:
+- Add template params to forwarding calls in deprecated KokkosKernels::… [\#1441](https://github.com/kokkos/kokkos-kernels/pull/1441)
+
+### Implemented enhancements:
+
+####
+- SPILUK: Move host allocations to symbolic [\#1480](https://github.com/kokkos/kokkos-kernels/pull/1480)
+- trsv: remove assumptions about entry order within rows [\#1463](https://github.com/kokkos/kokkos-kernels/pull/1463)
+
+#### Hierarchical BLAS algorithms, added and moved from batched:
+- Blas serial axpy and nrm2 [\#1460](https://github.com/kokkos/kokkos-kernels/pull/1460)
+- Move Set/Scale unit test to KokkosBlas [\#1455](https://github.com/kokkos/kokkos-kernels/pull/1455)
+- Move {Serial,Team,TeamVector} Set to KokkosBlas [\#1454](https://github.com/kokkos/kokkos-kernels/pull/1454)
+- Move {Serial,Team,TeamVector}Scale to KokkosBlas [\#1448](https://github.com/kokkos/kokkos-kernels/pull/1448)
+
+#### Code base organization and clean-ups:
+- Common Utils: removing dependency on Sparse Utils in Common Utils [\#1436](https://github.com/kokkos/kokkos-kernels/pull/1436)
+- Common cleanup [\#1431](https://github.com/kokkos/kokkos-kernels/pull/1431)
+- Clean-up src: re-organizing the src directory [\#1398](https://github.com/kokkos/kokkos-kernels/pull/1398)
+- Sparse utils namespace [\#1439](https://github.com/kokkos/kokkos-kernels/pull/1439)
+
+#### perf tests updates, fixes and clean-ups:
+- dot perf test: adding support for HIP and SYCL backend [\#1453](https://github.com/kokkos/kokkos-kernels/pull/1453)
+- Add verbosity parameter to GMRES example. Turn off for testing. [\#1385](https://github.com/kokkos/kokkos-kernels/pull/1385)
+- KokkosSparse_spiluk.cpp perf test: add int-int guards to cusparse codes [\#1369](https://github.com/kokkos/kokkos-kernels/pull/1369)
+- perf_test/blas: Check ARMPL build version [\#1352](https://github.com/kokkos/kokkos-kernels/pull/1352)
+- Clean-up batched block tridiag perf test [\#1343](https://github.com/kokkos/kokkos-kernels/pull/1343)
+- Reduce lots of macro duplication in sparse unit tests [\#1340](https://github.com/kokkos/kokkos-kernels/pull/1340)
+
+#### Infrastructure changes: ETI and testing upgrades, minor fixes
+- sycl: re-enabling test now that dpcpp has made progress [\#1473](https://github.com/kokkos/kokkos-kernels/pull/1473)
+- Only instantiate Kokkos's default Cuda mem space [\#1361](https://github.com/kokkos/kokkos-kernels/pull/1361)
+- Sparse and CI updates [\#1411](https://github.com/kokkos/kokkos-kernels/pull/1411)
+- Newer sparse tests were not following the new testing pattern [\#1356](https://github.com/kokkos/kokkos-kernels/pull/1356)
+- Add ETI for D1 coloring [\#1401](https://github.com/kokkos/kokkos-kernels/pull/1401)
+- Add ETI to SpAdd (symbolic and numeric) [\#1399](https://github.com/kokkos/kokkos-kernels/pull/1399)
+- Reformat example/fenl files changed in 1382 [\#1464](https://github.com/kokkos/kokkos-kernels/pull/1464)
+- Change Controls::getParameter error message from stdout to stderr [\#1416](https://github.com/kokkos/kokkos-kernels/pull/1416)
+
+#### Kokkos alignment: update our implementations to use newer Kokkos features
+- Arith traits integral nan [\#1438](https://github.com/kokkos/kokkos-kernels/pull/1438)
+- Kokkos_ArithTraits: re-implementation using Kokkos Core [\#1406](https://github.com/kokkos/kokkos-kernels/pull/1406)
+- Value-initialize result of MaxLoc reduction to avoid maybe uninitialized warning [\#1383](https://github.com/kokkos/kokkos-kernels/pull/1383)
+- Remove volatile qualifiers in reducer join(), init(), and operator+= methods [\#1382](https://github.com/kokkos/kokkos-kernels/pull/1382)
+
+#### BLAS and batched algorithms updates
+- Update Batched GMRES [\#1392](https://github.com/kokkos/kokkos-kernels/pull/1392)
+- GEMV: accumulate in float for scalar = bhalf_t [\#1360](https://github.com/kokkos/kokkos-kernels/pull/1360)
+- Restore BLAS-1 MV paths for 1 column [\#1354](https://github.com/kokkos/kokkos-kernels/pull/1354)
+
+#### Sparse and Graph updates
+- Minor updates to cluster Gauss-Seidel [\#1372](https://github.com/kokkos/kokkos-kernels/pull/1372)
+- Add unit test for BsrMatrix and BlockCrsMatrix spmv [\#1338](https://github.com/kokkos/kokkos-kernels/pull/1338)
+- Refactor SPGEMM MKL Impl [\#1244](https://github.com/kokkos/kokkos-kernels/pull/1244)
+- D1 coloring: remove unused but set variable [\#1403](https://github.com/kokkos/kokkos-kernels/pull/1403)
+
+#### half precision paper
+- Minor changes for half precision paper [\#1429](https://github.com/kokkos/kokkos-kernels/pull/1429)
+- Add benchmarks for us-rse escience 2022 half precision paper [\#1422](https://github.com/kokkos/kokkos-kernels/pull/1422)
+
+
+### Bug Fixes:
+- TPLs: adding CUBLAS in the list of dependencies [\#1482](https://github.com/kokkos/kokkos-kernels/pull/1482)
+- Fix MKL build errors [\#1478](https://github.com/kokkos/kokkos-kernels/pull/1478)
+- Fixup drop layout template param in rank-0 views [\#1476](https://github.com/kokkos/kokkos-kernels/pull/1476)
+- BLAS: fixing test that access results before synching [\#1472](https://github.com/kokkos/kokkos-kernels/pull/1472)
+- Fix D1 color ETI with both CudaSpace and UVM [\#1471](https://github.com/kokkos/kokkos-kernels/pull/1471)
+- Fix arithtraits warning [\#1468](https://github.com/kokkos/kokkos-kernels/pull/1468)
+- Fix build when double not instantiated [\#1467](https://github.com/kokkos/kokkos-kernels/pull/1467)
+- Fix -Werror [\#1466](https://github.com/kokkos/kokkos-kernels/pull/1466)
+- Fix GitHub CI failing on broken develop [\#1461](https://github.com/kokkos/kokkos-kernels/pull/1461)
+- HIP: fix warning from ExecSpaceUtils and GEMV [\#1459](https://github.com/kokkos/kokkos-kernels/pull/1459)
+- Removes a duplicate cuda_data_type_from when KOKKOS_HALF_T_IS_FLOAT [\#1456](https://github.com/kokkos/kokkos-kernels/pull/1456)
+- Fix incorrect function call in KokkosBatched::TeamGEMV unit test [\#1444](https://github.com/kokkos/kokkos-kernels/pull/1444)
+- Fix SYCL nightly test [\#1419](https://github.com/kokkos/kokkos-kernels/pull/1419)
+- Fix issues with cuSparse TPL availability for BsrMatrix SpMV [\#1418](https://github.com/kokkos/kokkos-kernels/pull/1418)
+- SpMV: fixing issues with unit-tests tolerance [\#1412](https://github.com/kokkos/kokkos-kernels/pull/1412)
+- Address 1409 [\#1410](https://github.com/kokkos/kokkos-kernels/pull/1410)
+- Fix colliding include guards (copy-paste mistake) [\#1408](https://github.com/kokkos/kokkos-kernels/pull/1408)
+- src/sparse: Fix & check for fence post errors [\#1405](https://github.com/kokkos/kokkos-kernels/pull/1405)
+- Bspgemm fixes [\#1396](https://github.com/kokkos/kokkos-kernels/pull/1396)
+- Fix unused parameter warnings in GEMM test. [\#1381](https://github.com/kokkos/kokkos-kernels/pull/1381)
+- Fixes code deprecation warnings. [\#1379](https://github.com/kokkos/kokkos-kernels/pull/1379)
+- Fix sign-compare warning in SPMV perf test [\#1371](https://github.com/kokkos/kokkos-kernels/pull/1371)
+- Minor MKL fixes [\#1365](https://github.com/kokkos/kokkos-kernels/pull/1365)
+- perf_test/batched: Temporarily disable tests [\#1359](https://github.com/kokkos/kokkos-kernels/pull/1359)
+- Fix nightly builds following promotion of the math functions in Kokkos [\#1339](https://github.com/kokkos/kokkos-kernels/pull/1339)
+
+
 ## [3.6.01](https://github.com/kokkos/kokkos-kernels/tree/3.6.01) (2022-05-23)
 [Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.6.00...3.6.01)
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ba5323df27..40d6dd407b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -24,8 +24,8 @@ IF(NOT KOKKOSKERNELS_HAS_TRILINOS)
     PROJECT(KokkosKernels CXX)
   ENDIF()
   SET(KokkosKernels_VERSION_MAJOR 3)
-  SET(KokkosKernels_VERSION_MINOR 6)
-  SET(KokkosKernels_VERSION_PATCH 01)
+  SET(KokkosKernels_VERSION_MINOR 7)
+  SET(KokkosKernels_VERSION_PATCH 00)
   SET(KokkosKernels_VERSION "${KokkosKernels_VERSION_MAJOR}.${KokkosKernels_VERSION_MINOR}.${KokkosKernels_VERSION_PATCH}")
   MATH(EXPR KOKKOSKERNELS_VERSION "${KokkosKernels_VERSION_MAJOR} * 10000 + ${KokkosKernels_VERSION_MINOR} * 100 + ${KokkosKernels_VERSION_PATCH}")
 ENDIF()
@@ -35,7 +35,7 @@ CMAKE_POLICY(SET CMP0074 NEW)
 
 INCLUDE(GNUInstallDirs)
 IF (KOKKOSKERNELS_HAS_TRILINOS)
- SET(TRILINOS_INCDIR ${CMAKE_INSTALL_PREFIX}/${${PROJECT_NAME}_INSTALL_INCLUDE_DIR})
+ SET(TRILINOS_INCDIR ${${PROJECT_NAME}_INSTALL_INCLUDE_DIR})
  SET(KOKKOSKERNELS_HEADER_INSTALL_DIR ${TRILINOS_INCDIR})
  SET(KOKKOS_ENABLE_CUDA_UVM ${Kokkos_ENABLE_CUDA_UVM})
 ELSEIF(KOKKOSKERNELS_HAS_PARENT)
diff --git a/README.md b/README.md
index 08f80c19d6..58127b912e 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,5 @@
+[![Generic badge](https://readthedocs.org/projects/pip/badge/?version=latest&style=flat)](https://kokkos-kernels.readthedocs.io/en/latest/)
+
 ![KokkosKernels](https://avatars2.githubusercontent.com/u/10199860?s=200&v=4)
 
 # Kokkos Kernels
diff --git a/cm_generate_makefile.bash b/cm_generate_makefile.bash
index b26ba7be97..ee195ca0fe 100755
--- a/cm_generate_makefile.bash
+++ b/cm_generate_makefile.bash
@@ -230,7 +230,7 @@ display_help_text() {
       echo "--with-openmptarget:                          Enable OpenMPTarget backend."
       echo "--with-sycl:                                  Enable Sycl backend."
       echo "--with-openmp:                                Enable OpenMP backend."
-      echo "--with-pthread:                               Enable Pthreads backend."
+      echo "--with-threads:                               Enable Threads backend."
       echo "--with-serial:                                Enable Serial backend."
       echo "--with-devices:                               Explicitly add a set of backends."
       echo ""
@@ -274,6 +274,8 @@ display_help_text() {
       echo "                 Pascal61        = NVIDIA Pascal generation CC 6.1"
       echo "                 Volta70         = NVIDIA Volta generation CC 7.0"
       echo "                 Volta72         = NVIDIA Volta generation CC 7.2"
+      echo "                 Ampere80        = NVIDIA Ampere generation CC 8.0"
+      echo "                 Ampere86        = NVIDIA Ampere generation CC 8.6"
       echo ""
       echo "--compiler=/Path/To/Compiler  Set the compiler."
       echo ""
@@ -335,6 +337,7 @@ display_help_text() {
       echo "--kokkos-make-j=[NUM]:        Set -j parallel level for kokkos install"
       echo "                                Default: j == 4"
       echo "--enable-tests: build Kokkos Kernels unit and performance tests"
+      echo "--deprecated-code             Enable deprecated code (disabled by default)"
       echo "--enable-perfsuite: build Kokkos Kernels performance tests with
 RAJAPerf Suite"
 
@@ -360,6 +363,8 @@ KERNELS_DEFAULT_ETI_OPTION=""
 WITH_CUDA_BACKEND=OFF
 WITH_HIP_BACKEND=OFF
 
+KOKKOS_DEPRECATED_CODE=OFF
+
 while [[ $# > 0 ]]
 do
   key="$1"
@@ -415,8 +420,8 @@ do
     --with-sycl)
       update_kokkos_devices Sycl
       ;;
-    --with-pthread)
-      update_kokkos_devices Pthread
+    --with-threads)
+      update_kokkos_devices Threads
       ;;
     --with-serial)
       update_kokkos_devices Serial
@@ -522,6 +527,9 @@ do
     --disable-examples)
       KOKKOSKERNELS_DO_EXAMPLES=OFF
       ;;
+    --deprecated-code)
+      KOKKOS_DEPRECATED_CODE=ON
+      ;;
     --compiler*)
       COMPILER="${key#*=}"
       CNUM=$(command -v ${COMPILER} 2>&1 >/dev/null | grep "no ${COMPILER}" | wc -l)
@@ -738,9 +746,9 @@ cd ${KOKKOS_INSTALL_PATH}
 
 # Configure kokkos
 echo ""
-echo cmake $COMPILER_CMD  -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS}" -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_PATH} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF ${KOKKOS_PATH}
+echo cmake $COMPILER_CMD  -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS}" -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_PATH} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF -DKokkos_ENABLE_DEPRECATED_CODE_3=${KOKKOS_DEPRECATED_CODE} ${KOKKOS_PATH}
 echo ""
-cmake $COMPILER_CMD  -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS//\"}" -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_PATH} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES}${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF ${KOKKOS_PATH}
+cmake $COMPILER_CMD  -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS//\"}" -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_PATH} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES}${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF -DKokkos_ENABLE_DEPRECATED_CODE_3=${KOKKOS_DEPRECATED_CODE} ${KOKKOS_PATH}
 
 # Install kokkos library
 make install -j $KOKKOS_MAKEINSTALL_J
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index 2dcedcc1c9..e8b1c6a5e2 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -1,7 +1,12 @@
 TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
         LIB_REQUIRED_PACKAGES KokkosCore KokkosContainers KokkosAlgorithms
-        LIB_OPTIONAL_TPLS quadmath MKL BLAS LAPACK CUSPARSE MAGMA METIS SuperLU Cholmod LAPACKE CBLAS ARMPL ROCBLAS ROCSPARSE
+        LIB_OPTIONAL_TPLS quadmath MKL BLAS LAPACK CUSPARSE MAGMA METIS SuperLU Cholmod LAPACKE CBLAS ARMPL ROCBLAS ROCSPARSE CUBLAS
         TEST_OPTIONAL_TPLS yaml-cpp
 )
 # NOTE: If you update names in LIB_OPTIONAL_TPLS above, make sure to map those names in
-# the macro 'KOKKOSKERNELS_ADD_TPL_OPTION' that resides in cmake/kokkoskernels_tpls.cmake.
\ No newline at end of file
+# the macro 'KOKKOSKERNELS_ADD_TPL_OPTION' that resides in cmake/kokkoskernels_tpls.cmake.
+
+if (TPL_ENABLE_CUDA)
+  tribits_tpl_tentatively_enable(CUBLAS)
+endif()
+
diff --git a/cmake/KokkosKernels_config.h.in b/cmake/KokkosKernels_config.h.in
index f8dd2ae133..1fb6a31544 100644
--- a/cmake/KokkosKernels_config.h.in
+++ b/cmake/KokkosKernels_config.h.in
@@ -70,6 +70,8 @@
 #cmakedefine KOKKOSKERNELS_INST_FLOAT
 /* Whether to build kernels for scalar type Kokkos::Experimental::half_t */
 #cmakedefine KOKKOSKERNELS_INST_HALF
+/* Whether to build kernels for scalar type Kokkos::Experimental::bhalf_t */
+#cmakedefine KOKKOSKERNELS_INST_BHALF
 /* Whether to build kernels for scalar type complex<double> */
 #cmakedefine KOKKOSKERNELS_INST_COMPLEX_DOUBLE
 /* Whether to build kernels for scalar type complex<float> */
diff --git a/cmake/Modules/FindTPLMKL.cmake b/cmake/Modules/FindTPLMKL.cmake
index 5766e0f5b0..56f4f34c9e 100644
--- a/cmake/Modules/FindTPLMKL.cmake
+++ b/cmake/Modules/FindTPLMKL.cmake
@@ -41,6 +41,10 @@ ELSE()
       LIBRARY_PATHS
         ${MKL_ROOT}/lib/intel64
         ${ENV_LIBDIRS}
+      HEADER
+        mkl.h
+      HEADER_PATHS
+        ${MKL_ROOT}/include
     )
   ENDIF()
 ENDIF()
diff --git a/cmake/kokkoskernels_eti_devices.cmake b/cmake/kokkoskernels_eti_devices.cmake
index 47dce1f9d1..9395cec564 100644
--- a/cmake/kokkoskernels_eti_devices.cmake
+++ b/cmake/kokkoskernels_eti_devices.cmake
@@ -41,19 +41,29 @@ SET(MEMSPACE_HBWSPACE_CPP_TYPE          Kokkos::HBWSpace)
 IF(KOKKOS_ENABLE_CUDA)
  KOKKOSKERNELS_ADD_OPTION(
    INST_EXECSPACE_CUDA
-   ${KOKKOSKERNELS_INST_EXECSPACE_CUDA_DEFAULT}
+   ON
    BOOL
    "Whether to pre instantiate kernels for the execution space Kokkos::Cuda. Disabling this when Kokkos_ENABLE_CUDA is enabled may increase build times. Default: ON if Kokkos is CUDA-enabled, OFF otherwise."
    )
+
+ # By default, instantiate only for Cuda's default memory space (either CudaSpace, or CudaUVMSpace).
+ IF(KOKKOS_ENABLE_CUDA_UVM)
+   SET(CUDA_CUDAUVMSPACE_DEFAULT ON)
+   SET(CUDA_CUDASPACE_DEFAULT OFF)
+ ELSE()
+   SET(CUDA_CUDAUVMSPACE_DEFAULT OFF)
+   SET(CUDA_CUDASPACE_DEFAULT ON)
+ ENDIF()
+
  KOKKOSKERNELS_ADD_OPTION(
    INST_MEMSPACE_CUDAUVMSPACE
-   ${KOKKOSKERNELS_INST_EXECSPACE_CUDA_DEFAULT}
+   ${CUDA_CUDAUVMSPACE_DEFAULT}
    BOOL
    "Whether to pre instantiate kernels for the memory space Kokkos::CudaUVMSpace.  Disabling this when Kokkos_ENABLE_CUDA is enabled may increase build times. Default: ON if Kokkos is CUDA-enabled, OFF otherwise."
    )
  KOKKOSKERNELS_ADD_OPTION(
    INST_MEMSPACE_CUDASPACE
-   ${KOKKOSKERNELS_INST_EXECSPACE_CUDA_DEFAULT}
+   ${CUDA_CUDASPACE_DEFAULT}
    BOOL
    "Whether to pre instantiate kernels for the memory space Kokkos::CudaSpace.  Disabling this when Kokkos_ENABLE_CUDA is enabled may increase build times. Default: ON if Kokkos is CUDA-enabled, OFF otherwise."
    )
diff --git a/cmake/kokkoskernels_eti_floats.cmake b/cmake/kokkoskernels_eti_floats.cmake
index debf99bb0e..3448874336 100644
--- a/cmake/kokkoskernels_eti_floats.cmake
+++ b/cmake/kokkoskernels_eti_floats.cmake
@@ -25,6 +25,13 @@ KOKKOSKERNELS_ADD_OPTION(
         "Whether to pre instantiate kernels for the scalar type Kokkos::Experimental::half_t.  Disabling this may increase build times. Default: OFF"
 )
 
+KOKKOSKERNELS_ADD_OPTION(
+        INST_BHALF
+        OFF
+        BOOL
+        "Whether to pre instantiate kernels for the scalar type Kokkos::Experimental::bhalf_t.  Disabling this may increase build times. Default: OFF"
+)
+
 SET(FLOATS
   FLOAT
   DOUBLE
@@ -33,6 +40,7 @@ SET(FLOATS
 SET(DOUBLE_CPP_TYPE "double")
 SET(FLOAT_CPP_TYPE "float")
 SET(HALF_CPP_TYPE "Kokkos::Experimental::half_t")
+SET(BHALF_CPP_TYPE "Kokkos::Experimental::bhalf_t")
 SET(COMPLEX_FLOAT_CPP_TYPE "Kokkos::complex<float>")
 SET(COMPLEX_DOUBLE_CPP_TYPE "Kokkos::complex<double>")
 
diff --git a/cmake/kokkoskernels_eti_offsets.cmake b/cmake/kokkoskernels_eti_offsets.cmake
index 171223010c..484175a976 100644
--- a/cmake/kokkoskernels_eti_offsets.cmake
+++ b/cmake/kokkoskernels_eti_offsets.cmake
@@ -1,5 +1,5 @@
 SET(KOKKOSKERNELS_INST_OFFSET_SIZE_T_DEFAULT ${KOKKOSKERNELS_ADD_DEFAULT_ETI})
-SET(KOKKOSKERNELS_INST_OFFSET_INT_DEFAULT ${KOKKOSKERNELS_ADD_DEFAULT_ETI})
+SET(KOKKOSKERNELS_INST_OFFSET_INT_DEFAULT OFF)
 SET(OFFSETS
   OFFSET_INT
   OFFSET_SIZE_T
@@ -12,14 +12,14 @@ KOKKOSKERNELS_ADD_OPTION(
   INST_OFFSET_INT
   ${KOKKOSKERNELS_INST_OFFSET_INT_DEFAULT}
   BOOL
-  "Whether to pre instantiate kernels for the offset type int.  This option is KokkosKernels_INST_OFFSET_INT=ON by default. Default: ON"
+  "Whether to pre instantiate kernels for the offset type int.  This option is KokkosKernels_INST_OFFSET_INT=OFF by default. Default: OFF"
   )
 
 KOKKOSKERNELS_ADD_OPTION(
   INST_OFFSET_SIZE_T
   ${KOKKOSKERNELS_INST_OFFSET_SIZE_T_DEFAULT}
   BOOL
-  "Whether to pre instantiate kernels for the offset type size_t.  This option is KokkosKernels_INST_OFFSET_SIZE_T=OFF by default. Default: ON"
+  "Whether to pre instantiate kernels for the offset type size_t.  This option is KokkosKernels_INST_OFFSET_SIZE_T=ON by default. Default: ON"
   )
 
 IF (KOKKOSKERNELS_INST_OFFSET_INT)
diff --git a/docs/conf.py b/docs/conf.py
index efb406329b..59377e4f11 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -18,11 +18,11 @@
 # -- Project information -----------------------------------------------------
 
 project = 'Kokkos Kernels'
-copyright = '2021, Evan Harvey'
-author = 'Evan Harvey'
+copyright = '2022, Kokkos Development Team'
+author = 'Kokkos Team'
 
 # The full version, including alpha/beta/rc tags
-release = 'v3.4.1'
+release = 'latest'
 
 
 # -- General configuration ---------------------------------------------------
diff --git a/docs/developer/apidocs.rst b/docs/developer/apidocs.rst
new file mode 100644
index 0000000000..82797c5801
--- /dev/null
+++ b/docs/developer/apidocs.rst
@@ -0,0 +1,14 @@
+Source Code Documentation
+=========================
+
+The source documentation is extracted from the C++ files using Doxygen.
+
+.. toctree::
+   :maxdepth: 4
+
+   apidocs/blas1
+   apidocs/blas2
+   apidocs/blas3
+   apidocs/sparse
+   apidocs/batched_dense
+   apidocs/batched_sparse
\ No newline at end of file
diff --git a/docs/developer/apidocs/batched_dense.rst b/docs/developer/apidocs/batched_dense.rst
new file mode 100644
index 0000000000..1d65842061
--- /dev/null
+++ b/docs/developer/apidocs/batched_dense.rst
@@ -0,0 +1,257 @@
+BATCHED -- KokkosKernels batched functor-level interfaces
+=========================================================
+
+innerlu
+-------
+CodeCleanup-TODO: Move Decl file to dense/impl/KokkosBatched_InnerLU_Internal.hpp
+
+applypivot
+----------
+.. doxygenstruct:: KokkosBatched::TeamVectorApplyPivot
+    :members:
+
+qr_withcolumnpivoting
+---------------------
+.. doxygenstruct:: KokkosBatched::TeamVectorQR_WithColumnPivoting
+    :members:
+
+addradial
+---------
+.. doxygenstruct:: KokkosBatched::SerialAddRadial
+    :members:
+.. doxygenstruct:: KokkosBatched::TeamAddRadial
+    :members:
+
+householder
+-----------
+.. doxygenstruct:: KokkosBatched::SerialHouseholder
+    :members:
+.. doxygenstruct:: KokkosBatched::TeamVectorHouseholder
+    :members:
+
+set
+---
+.. doxygenstruct:: KokkosBatched::SerialSet
+    :members:
+.. doxygenstruct:: KokkosBatched::TeamSet
+    :members:
+.. doxygenstruct:: KokkosBatched::TeamVectorSet
+    :members:
+
+scale
+-----
+.. doxygenstruct:: KokkosBatched::SerialScale
+    :members:
+.. doxygenstruct:: KokkosBatched::TeamScale
+    :members:
+.. doxygenstruct:: KokkosBatched::TeamVectorScale
+    :members:
+
+setidentity
+-----------
+.. doxygenstruct:: KokkosBatched::SerialSetIdentity
+    :members:
+.. doxygenstruct:: KokkosBatched::TeamSetIdentity
+    :members:
+.. doxygenstruct:: KokkosBatched::SetIdentity
+    :members:
+
+applyhouseholder
+----------------
+.. doxygenstruct:: KokkosBatched::SerialApplyHouseholder
+    :members:
+.. doxygenstruct:: KokkosBatched::TeamVectorApplyHouseholder
+    :members:
+
+innermultipledotproduct
+-----------------------
+CodeCleanup-TODO: Move Decl file to dense/impl/KokkosBatched_InnerMultipleDotProduct_Internal.hpp
+
+lu
+--
+.. doxygenstruct:: KokkosBatched::SerialLU
+    :members:
+.. doxygenstruct:: KokkosBatched::TeamLU
+    :members:
+.. doxygenstruct:: KokkosBatched::LU
+    :members:
+
+solveutv
+--------
+.. doxygenstruct:: KokkosBatched::TeamVectorSolveUTV
+    :members:
+
+utv
+---
+.. doxygenstruct:: KokkosBatched::TeamVectorUTV
+    :members:
+
+inverselu
+---------
+CodeCleanup-TODO: Move Decl file to dense/impl/KokkosBatched_InverseLU_Internal.hpp
+
+svd
+---
+.. doxygenstruct:: KokkosBatched::SerialSVD
+    :members:
+
+eigendecomposition
+------------------
+.. doxygenstruct:: KokkosBatched::SerialEigendecomposition
+    :members:
+.. doxygenstruct:: KokkosBatched::TeamVectorEigendecomposition
+    :members:
+
+trtri
+-----
+.. doxygenstruct:: KokkosBatched::SerialTrtri
+    :members:
+
+qr
+--
+.. doxygenstruct:: KokkosBatched::SerialQR
+    :members:
+.. doxygenstruct:: KokkosBatched::TeamQR
+    :members:
+.. doxygenstruct:: KokkosBatched::TeamVectorQR
+    :members:
+.. doxygenstruct:: KokkosBatched::QR
+    :members:
+
+trmm
+----
+.. doxygenstruct:: KokkosBatched::SerialTrmm
+    :members:
+
+trsm
+----
+.. doxygenstruct:: KokkosBatched::SerialTrsm
+    :members:
+.. doxygenstruct:: KokkosBatched::TeamTrsm
+    :members:
+.. doxygenstruct:: KokkosBatched::TeamVectorTrsm
+    :members:
+.. doxygenstruct:: KokkosBatched::Trsm
+    :members:
+
+innergemmfixa
+-------------
+CodeCleanup-TODO: Move Decl file to dense/impl/KokkosBatched_InnerGemmFixA_Internal.hpp
+
+innergemmfixb
+-------------
+CodeCleanup-TODO: Move Decl file to dense/impl/KokkosBatched_InnerGemmFixB_Internal.hpp
+
+innergemmfixc
+-------------
+CodeCleanup-TODO: Move Decl file to dense/impl/KokkosBatched_InnerGemmFixC_Internal.hpp
+
+applyq
+------
+.. doxygenstruct:: KokkosBatched::SerialApplyQ
+    :members:
+.. doxygenstruct:: KokkosBatched::TeamApplyQ
+    :members:
+.. doxygenstruct:: KokkosBatched::TeamVectorApplyQ
+    :members:
+.. doxygenstruct:: KokkosBatched::ApplyQ
+    :members:
+
+copy
+----
+.. doxygenstruct:: KokkosBatched::SerialCopy
+    :members:
+.. doxygenstruct:: KokkosBatched::TeamCopy
+    :members:
+.. doxygenstruct:: KokkosBatched::TeamVectorCopy
+    :members:
+.. doxygenstruct:: KokkosBatched::Copy
+    :members:
+
+innertrsm
+---------
+CodeCleanup-TODO: Move Decl file to dense/impl/KokkosBatched_InnerTrsm_Internal.hpp
+
+solvelu
+-------
+.. doxygenstruct:: KokkosBatched::SerialSolveLU
+    :members:
+.. doxygenstruct:: KokkosBatched::TeamSolveLU
+    :members:
+.. doxygenstruct:: KokkosBatched::SolveLU
+    :members:
+
+xpay
+----
+.. doxygenstruct:: KokkosBatched::SerialXpay
+    :members:
+.. doxygenstruct:: KokkosBatched::TeamXpay
+    :members:
+.. doxygenstruct:: KokkosBatched::TeamVectorXpay
+    :members:
+
+axpy
+----
+.. doxygenstruct:: KokkosBatched::SerialAxpy
+    :members:
+.. doxygenstruct:: KokkosBatched::TeamAxpy
+    :members:
+.. doxygenstruct:: KokkosBatched::TeamVectorAxpy
+    :members:
+
+gemv
+----
+.. doxygenstruct:: KokkosBatched::SerialGemv
+    :members:
+.. doxygenstruct:: KokkosBatched::TeamGemv
+    :members:
+.. doxygenstruct:: KokkosBatched::TeamVectorGemv
+    :members:
+.. doxygenstruct:: KokkosBatched::Gemv
+    :members:
+
+dot
+---
+.. doxygenstruct:: KokkosBatched::SerialDot
+    :members:
+.. doxygenstruct:: KokkosBatched::TeamDot
+    :members:
+.. doxygenstruct:: KokkosBatched::TeamVectorDot
+    :members:
+
+hadamardproduct
+---------------
+.. doxygenstruct:: KokkosBatched::SerialHadamardProduct
+    :members:
+.. doxygenstruct:: KokkosBatched::TeamHadamardProduct
+    :members:
+.. doxygenstruct:: KokkosBatched::TeamVectorHadamardProduct
+    :members:
+.. doxygenstruct:: KokkosBatched::HadamardProduct
+    :members:
+
+vector
+------
+CodeCleanup-TODO: Move Decl file to dense/impl/
+
+trsv
+----
+.. doxygenstruct:: KokkosBatched::SerialTrsv
+    :members:
+.. doxygenstruct:: KokkosBatched::TeamTrsv
+    :members:
+.. doxygenstruct:: KokkosBatched::TeamVectorTrsv
+    :members:
+.. doxygenstruct:: KokkosBatched::Trsv
+    :members:
+
+gemm
+----
+.. doxygenstruct:: KokkosBatched::SerialGemm
+    :members:
+.. doxygenstruct:: KokkosBatched::TeamGemm
+    :members:
+.. doxygenstruct:: KokkosBatched::TeamVectorGemm
+    :members:
+.. doxygenstruct:: KokkosBatched::Gemm
+    :members:
\ No newline at end of file
diff --git a/docs/developer/apidocs/batched_sparse.rst b/docs/developer/apidocs/batched_sparse.rst
new file mode 100644
index 0000000000..48031bc550
--- /dev/null
+++ b/docs/developer/apidocs/batched_sparse.rst
@@ -0,0 +1,43 @@
+SPARSE BATCHED -- KokkosKernels sparse batched functor-level interfaces
+=======================================================================
+
+cg
+--
+.. doxygenstruct:: KokkosBatched::CG
+    :members:
+
+crsmatrix
+---------
+.. doxygenclass:: KokkosBatched::CrsMatrix
+    :members:
+
+gmres
+-----
+.. doxygenstruct:: KokkosBatched::GMRES
+    :members:
+
+identity
+--------
+.. doxygenclass:: KokkosBatched::Identity
+    :members:
+
+jacobiprec
+----------
+.. doxygenclass:: KokkosBatched::JacobiPrec
+    :members:
+
+krylovhandle
+------------
+.. doxygenclass:: KokkosBatched::KrylovHandle
+    :members:
+
+spmv
+----
+.. doxygenstruct:: KokkosBatched::SerialSpmv
+    :members:
+.. doxygenstruct:: KokkosBatched::TeamSpmv
+    :members:
+.. doxygenstruct:: KokkosBatched::TeamVectorSpmv
+    :members:
+.. doxygenstruct:: KokkosBatched::Spmv
+    :members:
\ No newline at end of file
diff --git a/docs/developer/apidocs/blas1.rst b/docs/developer/apidocs/blas1.rst
new file mode 100644
index 0000000000..bfeb7fd1bb
--- /dev/null
+++ b/docs/developer/apidocs/blas1.rst
@@ -0,0 +1,55 @@
+BLAS1 -- KokkosKernels blas1 interfaces
+=======================================
+
+axpby
+-----
+.. doxygenfunction:: KokkosBlas::axpby
+
+dot
+---
+.. doxygenfunction:: KokkosBlas::dot(const RV &, const XMV &, const YMV &, typename std::enable_if<Kokkos::is_view<RV>::value, int>::type = 0)
+.. doxygenfunction:: KokkosBlas::dot(const XVector &, const YVector &)
+
+fill
+----
+.. doxygenfunction:: KokkosBlas::fill
+
+mult
+----
+.. doxygenfunction:: KokkosBlas::mult
+
+nrm1
+----
+.. doxygenfunction:: KokkosBlas::nrm1(const RV &, const XMV &, typename std::enable_if<Kokkos::is_view<RV>::value, int>::type = 0)
+.. doxygenfunction:: KokkosBlas::nrm1(const XVector &)
+
+nrm2
+----
+.. doxygenfunction:: KokkosBlas::nrm2(const RV &R, const XMV &X, typename std::enable_if<Kokkos::is_view<RV>::value, int>::type = 0)
+.. doxygenfunction:: KokkosBlas::nrm2(const XVector &x)
+
+nrm2w
+-----
+.. doxygenfunction:: KokkosBlas::nrm2w(const RV &R, const XMV &X, const XMV &W, typename std::enable_if<Kokkos::is_view<RV>::value, int>::type = 0)
+.. doxygenfunction:: KokkosBlas::nrm2w(const XVector &x, const XVector &w)
+
+nrminf
+------
+.. doxygenfunction:: KokkosBlas::nrminf(const RV &R, const XMV &X, typename std::enable_if<Kokkos::is_view<RV>::value, int>::type = 0)
+.. doxygenfunction:: KokkosBlas::nrminf(const XVector &x)
+
+reciprocal
+----------
+.. doxygenfunction:: KokkosBlas::reciprocal
+
+scal
+----
+.. doxygenfunction:: KokkosBlas::scal
+
+sum
+---
+.. doxygenfunction:: KokkosBlas::sum(const RV &R, const XMV &X, typename std::enable_if<Kokkos::is_view<RV>::value, int>::type = 0)
+
+update
+------
+.. doxygenfunction:: KokkosBlas::update
diff --git a/docs/developer/apidocs/blas2.rst b/docs/developer/apidocs/blas2.rst
new file mode 100644
index 0000000000..1d9a3f3fa7
--- /dev/null
+++ b/docs/developer/apidocs/blas2.rst
@@ -0,0 +1,7 @@
+BLAS2 -- KokkosKernels blas2 interfaces
+=======================================
+
+gemv
+----
+.. doxygenfunction:: KokkosBlas::gemv(const char trans[], typename AViewType::const_value_type &alpha, const AViewType &A, const XViewType &x, typename YViewType::const_value_type &beta, const YViewType &y)
+.. doxygenfunction:: KokkosBlas::gemv(const typename AViewType::execution_space &space, const char trans[], typename AViewType::const_value_type &alpha, const AViewType &A, const XViewType &x, typename YViewType::const_value_type &beta, const YViewType &y)
diff --git a/docs/developer/apidocs/blas3.rst b/docs/developer/apidocs/blas3.rst
new file mode 100644
index 0000000000..810b28a5a3
--- /dev/null
+++ b/docs/developer/apidocs/blas3.rst
@@ -0,0 +1,8 @@
+BLAS3 -- KokkosKernels blas3 interfaces
+=======================================
+
+gemm
+----
+.. doxygenfunction:: KokkosBlas::gemm(const char transA, const char transB, AMat::const_value_type alpha, const AMat &a, const BMat &b, CMat::const_value_type beta, const CMat &c)
+.. doxygenfunction:: KokkosBlas::gemm(const char transA[], const char transB[], typename AViewType::const_value_type &alpha, const AViewType &A, const BViewType &B, typename CViewType::const_value_type &beta, const CViewType &C)
+.. doxygenfunction:: KokkosBlas::gemm(const typename CViewType::execution_space &space, const char transA[], const char transB[], typename AViewType::const_value_type &alpha, const AViewType &A, const BViewType &B, typename CViewType::const_value_type &beta, const CViewType &C)
diff --git a/docs/developer/apidocs/sparse.rst b/docs/developer/apidocs/sparse.rst
new file mode 100644
index 0000000000..84ec48a519
--- /dev/null
+++ b/docs/developer/apidocs/sparse.rst
@@ -0,0 +1,27 @@
+SPARSE -- KokkosKernels sparse interfaces
+=========================================
+
+crsmatrix
+---------
+.. doxygenclass::    KokkosSparse::CrsMatrix
+    :members:
+
+spmv
+----
+.. doxygenfunction:: KokkosSparse::spmv(KokkosKernels::Experimental::Controls, const char[], const AlphaType&, const AMatrix&, const XVector&, const BetaType&, const YVector&)
+.. doxygenfunction:: KokkosSparse::spmv(KokkosKernels::Experimental::Controls, const char mode[], const AlphaType &alpha, const AMatrix &A, const XVector &x, const BetaType &beta, const YVector &y)
+.. doxygenfunction:: KokkosSparse::spmv(KokkosKernels::Experimental::Controls controls, const char mode[], const AlphaType &alpha, const AMatrix &A, const XVector &x, const BetaType &beta, const YVector &y, const RANK_ONE)
+.. doxygenfunction:: KokkosSparse::spmv(KokkosKernels::Experimental::Controls, const char mode[], const AlphaType &alpha, const AMatrix &A, const XVector &x, const BetaType &beta, const YVector &y, const RANK_TWO)
+.. doxygenfunction:: KokkosSparse::spmv(const char mode[], const AlphaType &alpha, const AMatrix &A, const XVector &x, const BetaType &beta, const YVector &y)
+
+trsv
+----
+.. doxygenfunction:: KokkosSparse::trsv
+
+spgemm
+------
+.. doxygenfunction:: KokkosSparse::spgemm
+
+gauss
+-----
+.. doxygenfunction:: KokkosSparse::gauss
diff --git a/docs/developer/build_doc.rst b/docs/developer/build_doc.rst
new file mode 100644
index 0000000000..dd3d357286
--- /dev/null
+++ b/docs/developer/build_doc.rst
@@ -0,0 +1,18 @@
+Building Developer Documentation
+================================
+
+.. code-block::
+    :caption: Installing dependencies on MacOS
+
+        brew install doxygen
+        pip install sphinx
+        pip install breathe
+        pip install sphinx-rtd-theme
+
+.. code-block::
+    :caption: How to build developer documentation
+
+        cmake -DKokkosKernels_ENABLE_DOCS:BOOL=ON /path/to/kokkos-kernels
+        make Doxygen
+        make Sphinx
+        open build/docs/docs/sphinx/index.html
\ No newline at end of file
diff --git a/docs/developer/contrib.rst b/docs/developer/contrib.rst
new file mode 100644
index 0000000000..0b02ebf190
--- /dev/null
+++ b/docs/developer/contrib.rst
@@ -0,0 +1,46 @@
+Contributing
+============
+
+Comment Style
+-------------
+We follow doxygen style comments for both external (API) and internal members. See https://www.doxygen.nl/manual/docblocks.html for details.
+Our documentation can be generated using the `-DKokkosKernels_ENABLE_DOCS:BOOL=ON` cmake flag; see `Building the Documentation`.
+
+In general, we prefer that the prototype has the doxygen style comment rather than the definition. If there is no prototype, then the definition should have the doxygen style comment.
+
+.. code-block::
+    :caption: API Doxygen Style Example
+
+        /// \brief Blocking wrapper for accessing a Kokkos View.
+        /// \tparam ViewValueType The value type (Scalar or Vector) of each view element
+        /// \tparam ViewType The view type
+        /// \param v The view handle
+        /// \param m The requested row index of v
+        /// \param n The requested col index of v
+        /// \return If m and n are within the extents of v, a valid element of v;
+        ///         otherwise, the last element of v.
+        ///
+        template <class ViewValueType, class ViewType>
+        KOKKOS_INLINE_FUNCTION ViewValueType
+        access_view_bounds_check(ViewType v, int m, int n, const BoundsCheck::Yes &);
+
+Library policies
+----------------
+
+System-specific functions
+-------------------------
+For portability, any system-specific function that is not in the C++ standard should not be invoked from kokkos-kernels.
+
+Upcasting and downcasting
+-------------------------
+TODO
+
+Blocking and non-blocking interfaces
+------------------------------------
+All the APIs are non-blocking unless:
+1. A TPL is enabled
+2. The result vector resides on the host and work is offloaded to a device
+
+When a TPL is enabled, we follow the blocking semantics of the TPL interface.
+
+If no TPLs are enabled, callers can avoid blocking calls by using any overload which accepts a result vector type as a template argument.
\ No newline at end of file
diff --git a/docs/developer/index.rst b/docs/developer/index.rst
new file mode 100644
index 0000000000..7ee05f98ae
--- /dev/null
+++ b/docs/developer/index.rst
@@ -0,0 +1,10 @@
+Developer Manual
+================
+
+.. toctree::
+   :maxdepth: 1
+
+   Source Code Documentation <apidocs.rst>
+   Building the Documentation <build_doc.rst>
+   Code Style Guide <style.rst>
+   Contributing <contrib.rst>
\ No newline at end of file
diff --git a/docs/developer/style.rst b/docs/developer/style.rst
new file mode 100644
index 0000000000..ddd9ce5197
--- /dev/null
+++ b/docs/developer/style.rst
@@ -0,0 +1,34 @@
+Style Guide
+===========
+
+We follow google's c++ coding style. See https://google.github.io/styleguide/cppguide.html and https://github.com/kokkos/kokkos-kernels/blob/master/.clang-format for details. 
+
+.. code-block::
+    :caption: Automate coding style via a pre-commit hook
+
+        cat kokkos-kernels/.git/hooks/pre-commit
+        for FILE in $(git diff --cached --name-only | egrep '.*\.cpp$|.*\.hpp$|.*\.h$')
+        do
+        if [ -e $file ]; then
+            clang-format-8 -i -style=file $FILE
+            git add $FILEA
+        fi
+        done
+        chmod +x kokkos-kernels/.git/hooks/pre-commit
+
+.. code-block::
+    :caption: Conditionally enable or disable formatting
+
+        // clang-format off
+        cpp code here
+        // clang-format on
+
+.. code-block::
+    :caption: Instal clang-format on MacOS
+
+        brew install clang-format-8
+
+.. code-block::
+    :caption: Instal clang-format on Ubuntu
+
+        apt install clang-format-8
\ No newline at end of file
diff --git a/docs/developer/write_developer_doc.rst b/docs/developer/write_developer_doc.rst
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/docs/developer/write_user_doc.rst b/docs/developer/write_user_doc.rst
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/docs/index.rst b/docs/index.rst
index 06240595bf..db873e9a3b 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -1,33 +1,8 @@
-.. Kokkos Kernels documentation master file, created by
-   sphinx-quickstart on Fri Sep 24 13:19:45 2021.
-   You can adapt this file completely to your liking, but it should at least
-   contain the root `toctree` directive.
-
-Welcome to Kokkos Kernels's documentation!
+Kokkos Kernels documentation: Under Construction
 ==========================================
-
 .. toctree::
    :maxdepth: 2
-   :caption: Contents:
-
-
-
-Indices and tables
-==================
-
-* :ref:`genindex`
-* :ref:`modindex`
-* :ref:`search`
-
-Docs
-====
 
-.. doxygennamespace:: KokkosBlas
-   :project: KokkosKernels
-   :members:
-.. doxygennamespace:: KokkosSparse
-   :project: KokkosKernels
-   :members:
-.. doxygennamespace:: KokkosBatched
-   :project: KokkosKernels
-   :members:
\ No newline at end of file
+   KokkosKernels GitHub Homepage <https://github.com/kokkos/kokkos-kernels>
+   User Manual <https://github.com/kokkos/kokkos-kernels/wiki>
+   Developer Docs <developer/index.rst>
diff --git a/doc/kokkos-promotion.txt b/docs/kokkos-promotion.txt
similarity index 100%
rename from doc/kokkos-promotion.txt
rename to docs/kokkos-promotion.txt
diff --git a/docs/requirements.txt b/docs/requirements.txt
new file mode 100644
index 0000000000..188f51e62d
--- /dev/null
+++ b/docs/requirements.txt
@@ -0,0 +1 @@
+breathe
\ No newline at end of file
diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt
index a0c8c1f564..45fb3a41e1 100644
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -7,3 +7,5 @@ KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../test_common)
 #ADD_SUBDIRECTORY(graph)
 ADD_SUBDIRECTORY(wiki)
 ADD_SUBDIRECTORY(gmres)
+ADD_SUBDIRECTORY(batched_solve)
+ADD_SUBDIRECTORY(half)
diff --git a/example/batched_solve/CMakeLists.txt b/example/batched_solve/CMakeLists.txt
new file mode 100644
index 0000000000..2e3ce96523
--- /dev/null
+++ b/example/batched_solve/CMakeLists.txt
@@ -0,0 +1,12 @@
+KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+KOKKOSKERNELS_ADD_EXECUTABLE(
+  static_pivoting
+  SOURCES static_pivoting.cpp
+  )
+
+KOKKOSKERNELS_ADD_EXECUTABLE(
+  team_GMRES
+  SOURCES team_GMRES.cpp
+  )
diff --git a/example/batched_solve/examples_helper.hpp b/example/batched_solve/examples_helper.hpp
new file mode 100644
index 0000000000..41b936a35c
--- /dev/null
+++ b/example/batched_solve/examples_helper.hpp
@@ -0,0 +1,236 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.4
+//       Copyright (2021) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+
+/// \brief create_saddle_point_matrices:
+///
+///  This function creates the matrices and the rhs of a batched saddle point
+///  systems where A and Y (the right hand side) are as follows:
+///
+///        ___________
+///       |     |   T |
+///       |  B  |  C  |
+///  A =  |-----+-----|
+///       |  C  |  0  |
+///       |_____|_____|
+///
+///        _____
+///       |     |
+///       |  D  |
+///  Y =  |-----|
+///       |  0  |
+///       |_____|
+///
+///  with A in R^{n \times n}, B in R^{(n-n_2) \times (n-n_2)} and
+///  where B and C are computed as follows:
+///
+///  1. A sequence of n-n_2 points of R^{n_dim} is generated randomly:
+///     x^(0), ..., x^(n-n_2-1)
+///  2. Given this sequence, the entries are computed as follows:
+///     B_{(i,j)} = \| x^(i) - x^(j)\|
+///     C_{(0,j)} = 1
+///     C_{(i,j)} = (x^(j))_{(i-1)} for i != 0
+///
+///  3. D is generated randomly.
+///
+/// This function uses a different sequence of x and a different D for every
+/// systems within the batched system.
+///
+/// As a consequence of its definitation, the diagonal of A is 0 for every
+/// entries.
+///
+/// \tparam MatrixViewType: type of the batched matrices
+/// \tparam VectorViewType: type of the batched vectors
+///
+/// \param A [in/out]: a rank 3 view that has to be prealocated that will store
+/// the entries of the batched matrix. \param Y [in/out]: a rank 2 view that has
+/// to be prealocated that will store the entries of the right hand side. \param
+/// n_dim [in]: the dimension of the physical space where the points are
+/// randomly generated (default = 3).
+///
+
+template <typename MatrixViewType, typename VectorViewType>
+void create_saddle_point_matrices(const MatrixViewType &A,
+                                  const VectorViewType &Y,
+                                  const int n_dim = 3) {
+  Kokkos::Random_XorShift64_Pool<
+      typename MatrixViewType::device_type::execution_space>
+      random(13718);
+  const int N   = A.extent(0);
+  const int n   = A.extent(1);
+  const int n_2 = n_dim + 1;
+  const int n_1 = n - n_2;
+
+  MatrixViewType xs("xs", N, n_1, n_dim);
+  VectorViewType ys("ys", N, n_1);
+
+  Kokkos::fill_random(
+      xs, random,
+      Kokkos::reduction_identity<typename MatrixViewType::value_type>::prod());
+  Kokkos::fill_random(
+      ys, random,
+      Kokkos::reduction_identity<typename VectorViewType::value_type>::prod());
+
+  auto xs_host = Kokkos::create_mirror_view(xs);
+  auto ys_host = Kokkos::create_mirror_view(ys);
+  auto A_host  = Kokkos::create_mirror_view(A);
+  auto Y_host  = Kokkos::create_mirror_view(Y);
+
+  Kokkos::deep_copy(xs_host, xs);
+  Kokkos::deep_copy(ys_host, ys);
+
+  for (int i = 0; i < n_1; ++i) {
+    for (int j = 0; j < n_1; ++j) {
+      for (int l = 0; l < N; ++l) {
+        auto xs_i = Kokkos::subview(xs_host, l, i, Kokkos::ALL);
+        auto xs_j = Kokkos::subview(xs_host, l, j, Kokkos::ALL);
+        typename MatrixViewType::value_type d = 0;
+        for (int k = 0; k < n_dim; ++k) d += Kokkos::pow(xs_i(k) - xs_j(k), 2);
+        d               = Kokkos::sqrt(d);
+        A_host(l, i, j) = Kokkos::pow(d, 5);
+      }
+    }
+    for (int l = 0; l < N; ++l) {
+      A_host(l, i, n_1) = (typename MatrixViewType::value_type)1.0;
+      A_host(l, n_1, i) = (typename MatrixViewType::value_type)1.0;
+      for (int k = 0; k < n_dim; ++k) {
+        A_host(l, i, n_1 + k + 1) = xs_host(l, i, k);
+        A_host(l, n_1 + k + 1, i) = xs_host(l, i, k);
+      }
+      Y_host(l, i) = ys_host(l, i);
+    }
+  }
+  for (int i = n_1; i < n; ++i) {
+    for (int l = 0; l < N; ++l) {
+      Y_host(l, i) = (typename MatrixViewType::value_type)0.0;
+    }
+  }
+
+  Kokkos::deep_copy(A, A_host);
+  Kokkos::deep_copy(Y, Y_host);
+
+  Kokkos::fence();
+}
+
+template <typename IntView, typename VectorViewType>
+void create_tridiagonal_batched_matrices(const int nnz, const int BlkSize,
+                                         const int N, const IntView &r,
+                                         const IntView &c,
+                                         const VectorViewType &D,
+                                         const VectorViewType &X,
+                                         const VectorViewType &B) {
+  Kokkos::Random_XorShift64_Pool<
+      typename VectorViewType::device_type::execution_space>
+      random(13718);
+  Kokkos::fill_random(
+      X, random,
+      Kokkos::reduction_identity<typename VectorViewType::value_type>::prod());
+  Kokkos::fill_random(
+      B, random,
+      Kokkos::reduction_identity<typename VectorViewType::value_type>::prod());
+
+  auto D_host = Kokkos::create_mirror_view(D);
+  auto r_host = Kokkos::create_mirror_view(r);
+  auto c_host = Kokkos::create_mirror_view(c);
+
+  r_host(0) = 0;
+
+  int current_col = 0;
+
+  for (int i = 0; i < BlkSize; ++i) {
+    r_host(i + 1) = r_host(i) + (i == 0 || i == (BlkSize - 1) ? 2 : 3);
+  }
+  for (int i = 0; i < nnz; ++i) {
+    if (i % 3 == 0) {
+      for (int l = 0; l < N; ++l) {
+        D_host(l, i) = typename VectorViewType::value_type(2.0);
+      }
+      c_host(i) = current_col;
+      ++current_col;
+    } else {
+      for (int l = 0; l < N; ++l) {
+        D_host(l, i) = typename VectorViewType::value_type(-1.0);
+      }
+      c_host(i) = current_col;
+      if (i % 3 == 1)
+        --current_col;
+      else
+        ++current_col;
+    }
+  }
+
+  Kokkos::fence();
+
+  Kokkos::deep_copy(D, D_host);
+  Kokkos::deep_copy(r, r_host);
+  Kokkos::deep_copy(c, c_host);
+
+  Kokkos::fence();
+}
+
+template <class VType, class IntType>
+void getInvDiagFromCRS(const VType &V, const IntType &r, const IntType &c,
+                       const VType &diag) {
+  auto diag_values_host = Kokkos::create_mirror_view(diag);
+  auto values_host      = Kokkos::create_mirror_view(V);
+  auto row_ptr_host     = Kokkos::create_mirror_view(r);
+  auto colIndices_host  = Kokkos::create_mirror_view(c);
+
+  Kokkos::deep_copy(values_host, V);
+  Kokkos::deep_copy(row_ptr_host, r);
+  Kokkos::deep_copy(colIndices_host, c);
+
+  int current_index;
+  int N       = diag.extent(0);
+  int BlkSize = diag.extent(1);
+
+  for (int i = 0; i < BlkSize; ++i) {
+    for (current_index = row_ptr_host(i); current_index < row_ptr_host(i + 1);
+         ++current_index) {
+      if (colIndices_host(current_index) == i) break;
+    }
+    for (int j = 0; j < N; ++j) {
+      diag_values_host(j, i) = 1. / values_host(j, current_index);
+    }
+  }
+
+  Kokkos::deep_copy(diag, diag_values_host);
+}
\ No newline at end of file
diff --git a/example/batched_solve/static_pivoting.cpp b/example/batched_solve/static_pivoting.cpp
new file mode 100644
index 0000000000..69ab25b62f
--- /dev/null
+++ b/example/batched_solve/static_pivoting.cpp
@@ -0,0 +1,182 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.4
+//       Copyright (2021) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+
+#include <fstream>
+
+#define KOKKOSKERNELS_DEBUG_LEVEL 0
+
+#include "Kokkos_Core.hpp"
+#include "Kokkos_Timer.hpp"
+#include "Kokkos_Random.hpp"
+#include "Kokkos_UnorderedMap.hpp"
+#include "Kokkos_Sort.hpp"
+
+/// KokkosKernels headers
+#include "KokkosBatched_Util.hpp"
+#include "KokkosBatched_Vector.hpp"
+#include "KokkosKernels_IOUtils.hpp"
+
+#include <Kokkos_ArithTraits.hpp>
+#include <KokkosBatched_Util.hpp>
+#include "examples_helper.hpp"
+#include <KokkosBatched_Trsv_Decl.hpp>
+#include <KokkosBatched_Trsv_Serial_Impl.hpp>
+#include <KokkosBatched_Trsv_Team_Impl.hpp>
+#include <KokkosBatched_LU_Decl.hpp>
+#include <KokkosBatched_LU_Serial_Impl.hpp>
+#include <KokkosBatched_LU_Team_Impl.hpp>
+#include "KokkosBatched_Gesv.hpp"
+
+typedef Kokkos::DefaultExecutionSpace exec_space;
+
+template <typename DeviceType, typename AViewType, typename XYViewType>
+struct Functor_TeamTestStaticPivoting {
+  const AViewType _A;
+  const XYViewType _X;
+  const XYViewType _Y;
+
+  KOKKOS_INLINE_FUNCTION
+  Functor_TeamTestStaticPivoting(const AViewType &A, const XYViewType &X,
+                                 const XYViewType &Y)
+      : _A(A), _X(X), _Y(Y) {}
+
+  template <typename MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const {
+    const int matrix_id = static_cast<int>(member.league_rank());
+
+    auto A = Kokkos::subview(_A, matrix_id, Kokkos::ALL, Kokkos::ALL);
+    auto X = Kokkos::subview(_X, matrix_id, Kokkos::ALL);
+    auto Y = Kokkos::subview(_Y, matrix_id, Kokkos::ALL);
+    member.team_barrier();
+    KokkosBatched::TeamGesv<MemberType,
+                            KokkosBatched::Gesv::StaticPivoting>::invoke(member,
+                                                                         A, X,
+                                                                         Y);
+    member.team_barrier();
+  }
+
+  inline void run() {
+    std::string name("KokkosBatched::Test::StaticPivoting");
+    Kokkos::TeamPolicy<DeviceType> policy(_A.extent(0), Kokkos::AUTO(),
+                                          Kokkos::AUTO());
+
+    using MatrixViewType =
+        Kokkos::View<typename AViewType::non_const_value_type **,
+                     typename AViewType::array_layout,
+                     typename AViewType::execution_space>;
+
+    const int n    = _A.extent(1);
+    size_t bytes_0 = MatrixViewType::shmem_size(n, n + 4);
+
+    policy.set_scratch_size(0, Kokkos::PerTeam(bytes_0));
+
+    Kokkos::parallel_for(name.c_str(), policy, *this);
+  }
+};
+
+template <typename DeviceType, typename AViewType, typename XYViewType>
+struct Functor_SerialTestStaticPivoting {
+  const AViewType _A;
+  const AViewType _tmp;
+  const XYViewType _X;
+  const XYViewType _Y;
+
+  KOKKOS_INLINE_FUNCTION
+  Functor_SerialTestStaticPivoting(const AViewType &A, const AViewType &tmp,
+                                   const XYViewType &X, const XYViewType &Y)
+      : _A(A), _tmp(tmp), _X(X), _Y(Y) {}
+
+  KOKKOS_INLINE_FUNCTION void operator()(const int &matrix_id) const {
+    auto A   = Kokkos::subview(_A, matrix_id, Kokkos::ALL, Kokkos::ALL);
+    auto tmp = Kokkos::subview(_tmp, matrix_id, Kokkos::ALL, Kokkos::ALL);
+    auto X   = Kokkos::subview(_X, matrix_id, Kokkos::ALL);
+    auto Y   = Kokkos::subview(_Y, matrix_id, Kokkos::ALL);
+    KokkosBatched::SerialGesv<KokkosBatched::Gesv::StaticPivoting>::invoke(
+        A, X, Y, tmp);
+  }
+
+  inline void run() {
+    std::string name("KokkosBatched::Test::StaticPivoting");
+
+    const int N = _A.extent(0);
+    Kokkos::parallel_for(name.c_str(), N, *this);
+  }
+};
+
+int main(int /*argc*/, char ** /*argv[]*/) {
+  Kokkos::initialize();
+  {
+    using layout = Kokkos::LayoutLeft;
+
+    using AViewType  = Kokkos::View<double ***, layout, exec_space>;
+    using XYViewType = Kokkos::View<double **, layout, exec_space>;
+
+    int N = 1;
+    int n = 10;
+
+    AViewType A("A", N, n, n);
+    AViewType tmp("tmp", N, n, n + 4);
+    XYViewType X("X", N, n);
+    XYViewType Y("Y", N, n);
+
+    create_saddle_point_matrices(A, Y);
+
+    // The matrices are modified by the GESV so we have to copy them if we want
+    // to solve the same systems twice.
+    AViewType A2("A2", N, n, n);
+    XYViewType Y2("Y2", N, n);
+    Kokkos::deep_copy(A2, A);
+    Kokkos::deep_copy(Y2, Y);
+
+    KokkosKernels::Impl::kk_write_3Dview_to_file(A, "A.txt");
+    KokkosKernels::Impl::kk_write_2Dview_to_file(Y, "Y.txt");
+
+    Functor_SerialTestStaticPivoting<exec_space, AViewType, XYViewType>(A, tmp,
+                                                                        X, Y)
+        .run();
+    KokkosKernels::Impl::kk_write_2Dview_to_file(X, "X_serial.txt");
+    Functor_TeamTestStaticPivoting<exec_space, AViewType, XYViewType>(A2, X, Y2)
+        .run();
+    KokkosKernels::Impl::kk_write_2Dview_to_file(X, "X_team.txt");
+  }
+  Kokkos::finalize();
+}
diff --git a/example/batched_solve/team_GMRES.cpp b/example/batched_solve/team_GMRES.cpp
new file mode 100644
index 0000000000..404e573491
--- /dev/null
+++ b/example/batched_solve/team_GMRES.cpp
@@ -0,0 +1,328 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.4
+//       Copyright (2021) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+
+#include <fstream>
+
+#define KOKKOSKERNELS_DEBUG_LEVEL 0
+
+#include "Kokkos_Core.hpp"
+#include "Kokkos_Timer.hpp"
+#include "Kokkos_Random.hpp"
+#include "Kokkos_UnorderedMap.hpp"
+#include "Kokkos_Sort.hpp"
+
+/// KokkosKernels headers
+#include "KokkosBatched_Util.hpp"
+#include "KokkosBatched_Vector.hpp"
+#include "KokkosKernels_IOUtils.hpp"
+
+#include <Kokkos_ArithTraits.hpp>
+#include <KokkosBatched_Util.hpp>
+#include "examples_helper.hpp"
+#include <KokkosBatched_Spmv.hpp>
+#include <KokkosBatched_GMRES.hpp>
+#include <KokkosBatched_CrsMatrix.hpp>
+#include <KokkosBatched_Krylov_Handle.hpp>
+#include <KokkosBatched_JacobiPrec.hpp>
+
+typedef Kokkos::DefaultExecutionSpace exec_space;
+
+template <typename DeviceType, typename ValuesViewType, typename IntView,
+          typename VectorViewType, typename KrylovHandleType, bool UsePrec>
+struct Functor_TestBatchedTeamVectorGMRES {
+  const ValuesViewType _values;
+  const ValuesViewType _diag;
+  const IntView _r;
+  const IntView _c;
+  const VectorViewType _X;
+  const VectorViewType _B;
+  const int _team_size, _vector_length;
+  KrylovHandleType _handle;
+
+  KOKKOS_INLINE_FUNCTION
+  Functor_TestBatchedTeamVectorGMRES(
+      const ValuesViewType &values, const IntView &r, const IntView &c,
+      const VectorViewType &X, const VectorViewType &B, const int team_size,
+      const int vector_length, KrylovHandleType &handle)
+      : _values(values),
+        _r(r),
+        _c(c),
+        _X(X),
+        _B(B),
+        _team_size(team_size),
+        _vector_length(vector_length),
+        _handle(handle) {}
+
+  KOKKOS_INLINE_FUNCTION
+  Functor_TestBatchedTeamVectorGMRES(
+      const ValuesViewType &values, const ValuesViewType &diag,
+      const IntView &r, const IntView &c, const VectorViewType &X,
+      const VectorViewType &B, const int team_size, const int vector_length,
+      KrylovHandleType &handle)
+      : _values(values),
+        _diag(diag),
+        _r(r),
+        _c(c),
+        _X(X),
+        _B(B),
+        _team_size(team_size),
+        _vector_length(vector_length),
+        _handle(handle) {}
+
+  template <typename MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const {
+    const int first_matrix = _handle.first_index(member.league_rank());
+    const int last_matrix  = _handle.last_index(member.league_rank());
+    using TeamVectorCopy1D =
+        KokkosBatched::TeamVectorCopy<MemberType,
+                                      KokkosBatched::Trans::NoTranspose, 1>;
+
+    auto d = Kokkos::subview(
+        _values, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL);
+    auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix),
+                             Kokkos::ALL);
+    auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix),
+                             Kokkos::ALL);
+
+    using ScratchPadIntViewType =
+        Kokkos::View<typename IntView::non_const_value_type *,
+                     typename IntView::array_layout,
+                     typename IntView::execution_space::scratch_memory_space>;
+    using ScratchPadValuesViewType = Kokkos::View<
+        typename ValuesViewType::non_const_value_type **,
+        typename ValuesViewType::array_layout,
+        typename ValuesViewType::execution_space::scratch_memory_space>;
+
+    using Operator =
+        KokkosBatched::CrsMatrix<ValuesViewType, ScratchPadIntViewType>;
+
+    ScratchPadIntViewType tmp_1D_int(member.team_scratch(0),
+                                     _r.extent(0) + _c.extent(0));
+
+    auto r =
+        Kokkos::subview(tmp_1D_int, Kokkos::make_pair(0, (int)_r.extent(0)));
+    auto c = Kokkos::subview(
+        tmp_1D_int,
+        Kokkos::make_pair((int)_r.extent(0), (int)tmp_1D_int.extent(0)));
+
+    TeamVectorCopy1D::invoke(member, _r, r);
+    TeamVectorCopy1D::invoke(member, _c, c);
+    Operator A(d, r, c);
+
+    if (UsePrec) {
+      ScratchPadValuesViewType diag(
+          member.team_scratch(0), last_matrix - first_matrix, _diag.extent(1));
+      using PrecOperator = KokkosBatched::JacobiPrec<ScratchPadValuesViewType>;
+
+      KokkosBatched::TeamVectorCopy<MemberType>::invoke(
+          member,
+          Kokkos::subview(_diag, Kokkos::make_pair(first_matrix, last_matrix),
+                          Kokkos::ALL),
+          diag);
+      PrecOperator P(diag);
+      P.setComputedInverse();
+
+      KokkosBatched::TeamVectorGMRES<MemberType>::template invoke<
+          Operator, VectorViewType, PrecOperator, KrylovHandleType>(
+          member, A, b, x, P, _handle);
+    } else {
+      KokkosBatched::TeamVectorGMRES<MemberType>::template invoke<
+          Operator, VectorViewType>(member, A, b, x, _handle);
+    }
+  }
+
+  inline double run() {
+    std::string name("KokkosBatched::Test::TeamVectorGMRES");
+    Kokkos::Timer timer;
+    Kokkos::Profiling::pushRegion(name.c_str());
+
+    Kokkos::TeamPolicy<DeviceType> auto_policy(_handle.get_number_of_teams(),
+                                               Kokkos::AUTO(), Kokkos::AUTO());
+    Kokkos::TeamPolicy<DeviceType> tuned_policy(_handle.get_number_of_teams(),
+                                                _team_size, _vector_length);
+    Kokkos::TeamPolicy<DeviceType> policy;
+
+    if (_team_size < 1)
+      policy = auto_policy;
+    else
+      policy = tuned_policy;
+
+    int maximum_iteration = _handle.get_max_iteration();
+
+    using ScalarType = typename ValuesViewType::non_const_value_type;
+    using Layout     = typename ValuesViewType::array_layout;
+    using EXSP       = typename ValuesViewType::execution_space;
+
+    using ViewType2D = Kokkos::View<ScalarType **, Layout, EXSP>;
+
+    size_t bytes_1D =
+        ViewType2D::shmem_size(_handle.get_number_of_systems_per_team(), 1);
+    size_t bytes_row_ptr = IntView::shmem_size(_r.extent(0));
+    size_t bytes_col_idc = IntView::shmem_size(_c.extent(0));
+    size_t bytes_2D_1    = ViewType2D::shmem_size(
+        _handle.get_number_of_systems_per_team(), _X.extent(1));
+    size_t bytes_2D_2 = ViewType2D::shmem_size(
+        _handle.get_number_of_systems_per_team(), maximum_iteration + 1);
+
+    size_t bytes_int  = bytes_row_ptr + bytes_col_idc;
+    size_t bytes_diag = bytes_2D_1;
+    size_t bytes_tmp  = 2 * bytes_2D_1 + 2 * bytes_1D + bytes_2D_2;
+
+    policy.set_scratch_size(
+        0, Kokkos::PerTeam(bytes_tmp + bytes_diag + bytes_int));
+
+    exec_space().fence();
+    timer.reset();
+    Kokkos::parallel_for(name.c_str(), policy, *this);
+    exec_space().fence();
+    double sec = timer.seconds();
+
+    return sec;
+  }
+};
+
+int main(int /*argc*/, char ** /*argv*/) {
+  Kokkos::initialize();
+  {
+    using layout = Kokkos::LayoutLeft;
+
+    using IntView          = Kokkos::View<int *, layout, exec_space>;
+    using AMatrixValueView = Kokkos::View<double **, layout, exec_space>;
+    using XYType           = Kokkos::View<double **, layout, exec_space>;
+
+    std::string name_A = "mat.mm";
+    std::string name_B = "rhs.mm";
+
+    int N, Blk, nnz;
+
+    Blk = 10;
+    N   = 100;
+    nnz = (Blk - 2) * 3 + 2 * 2;
+
+    IntView rowOffsets("rowOffsets", Blk + 1);
+    IntView colIndices("colIndices", nnz);
+    AMatrixValueView values("values", N, nnz);
+    AMatrixValueView diag("diag", N, Blk);
+    XYType x("x", N, Blk);
+    XYType y("y", N, Blk);
+
+    printf("N = %d, Blk = %d, nnz = %d\n", N, Blk, nnz);
+
+    create_tridiagonal_batched_matrices(nnz, Blk, N, rowOffsets, colIndices,
+                                        values, x, y);
+
+    // Replace y by ones:
+    Kokkos::deep_copy(y, 1.);
+
+    // Replace x by zeros:
+    // Kokkos::deep_copy(x, 0.);
+
+    getInvDiagFromCRS(values, rowOffsets, colIndices, diag);
+
+    using ScalarType = typename AMatrixValueView::non_const_value_type;
+    using Layout     = typename AMatrixValueView::array_layout;
+    using EXSP       = typename AMatrixValueView::execution_space;
+
+    using MagnitudeType =
+        typename Kokkos::Details::ArithTraits<ScalarType>::mag_type;
+
+    using Norm2DViewType   = Kokkos::View<MagnitudeType **, Layout, EXSP>;
+    using Scalar3DViewType = Kokkos::View<ScalarType ***, Layout, EXSP>;
+    using IntViewType      = Kokkos::View<int *, Layout, EXSP>;
+
+    using KrylovHandleType =
+        KokkosBatched::KrylovHandle<Norm2DViewType, IntViewType,
+                                    Scalar3DViewType>;
+
+    const int N_team       = 2;
+    const int n_iterations = 150;
+
+    const int team_size      = -1;
+    const int vector_length  = -1;
+    const double tol         = 1e-8;
+    const int ortho_strategy = 0;
+
+    KrylovHandleType handle(N, N_team, n_iterations, true);
+    handle.Arnoldi_view =
+        Scalar3DViewType("", N, n_iterations, Blk + n_iterations + 3);
+
+    handle.set_max_iteration(n_iterations);
+    handle.set_tolerance(tol);
+    handle.set_ortho_strategy(ortho_strategy);
+    handle.set_scratch_pad_level(0);
+    handle.set_compute_last_residual(true);
+
+    double time =
+        Functor_TestBatchedTeamVectorGMRES<exec_space, AMatrixValueView,
+                                           IntView, XYType, KrylovHandleType,
+                                           true>(values, diag, rowOffsets,
+                                                 colIndices, x, y, team_size,
+                                                 vector_length, handle)
+            .run();
+
+    printf("times = %f secondes\n", time);
+
+    for (int i = 0; i < N; ++i) {
+      if (handle.is_converged_host(i)) {
+        std::cout
+            << "System " << i << " converged in "
+            << handle.get_iteration_host(i)
+            << " iterations, the initial absolute norm of the residual was "
+            << handle.get_norm_host(i, 0) << " and is now "
+            << handle.get_last_norm_host(i) << std::endl;
+      } else {
+        std::cout
+            << "System " << i << " did not converge in "
+            << handle.get_max_iteration()
+            << " iterations, the initial absolute norm of the residual was "
+            << handle.get_norm_host(i, 0) << " and is now "
+            << handle.get_last_norm_host(i) << std::endl;
+      }
+    }
+    if (handle.is_converged_host())
+      std::cout << "All the systems have converged." << std::endl;
+    else
+      std::cout << "There is at least one system that did not converge."
+                << std::endl;
+  }
+  Kokkos::finalize();
+}
diff --git a/example/fenl/TestFixture.hpp b/example/fenl/TestFixture.hpp
index 165265b881..54b841c4b6 100644
--- a/example/fenl/TestFixture.hpp
+++ b/example/fenl/TestFixture.hpp
@@ -56,102 +56,101 @@
 namespace Kokkos {
 namespace Example {
 
-template< class Device >
-struct FixtureVerifyElemNodeCoord
-{
-  typedef Device execution_space ;
+template <class Device>
+struct FixtureVerifyElemNodeCoord {
+  typedef Device execution_space;
 
-  typedef struct { size_t success , error ; } value_type ;
+  typedef struct {
+    size_t success, error;
+  } value_type;
 
-  typedef Kokkos::Example::BoxElemFixture< Device , Kokkos::Example::BoxElemPart::ElemLinear > FixtureType ;
+  typedef Kokkos::Example::BoxElemFixture<
+      Device, Kokkos::Example::BoxElemPart::ElemLinear>
+      FixtureType;
 
-  FixtureType m_fixture ;
+  FixtureType m_fixture;
 
   KOKKOS_INLINE_FUNCTION
-  void init( value_type & update ) const { update.success = update.error = 0 ; }
+  void init(value_type& update) const { update.success = update.error = 0; }
 
   KOKKOS_INLINE_FUNCTION
-  void join( volatile       value_type & update ,
-             volatile const value_type & input ) const
-    {
-      update.success += input.success ;
-      update.error += input.error ;
-    }
-  
+  void join(value_type& update, const value_type& input) const {
+    update.success += input.success;
+    update.error += input.error;
+  }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( size_t ielem , value_type & update ) const
-  {
-    unsigned node_coord[ FixtureType::ElemNode ][3] ;
-
-    for ( unsigned i = 0 ; i < FixtureType::ElemNode ; ++i ) {
-      const unsigned node_id = m_fixture.elem_node(ielem,i);
-      node_coord[i][0] = m_fixture.node_grid(node_id,0);
-      node_coord[i][1] = m_fixture.node_grid(node_id,1);
-      node_coord[i][2] = m_fixture.node_grid(node_id,2);
+  void operator()(size_t ielem, value_type& update) const {
+    unsigned node_coord[FixtureType::ElemNode][3];
+
+    for (unsigned i = 0; i < FixtureType::ElemNode; ++i) {
+      const unsigned node_id = m_fixture.elem_node(ielem, i);
+      node_coord[i][0]       = m_fixture.node_grid(node_id, 0);
+      node_coord[i][1]       = m_fixture.node_grid(node_id, 1);
+      node_coord[i][2]       = m_fixture.node_grid(node_id, 2);
     }
 
-    int error = 0 ;
-    for ( unsigned i = 1 ; i < FixtureType::ElemNode ; ++i ) {
-      if ( node_coord[0][0] + m_fixture.elem_node_local(i,0) != node_coord[i][0] ||
-           node_coord[0][1] + m_fixture.elem_node_local(i,1) != node_coord[i][1] ||
-           node_coord[0][2] + m_fixture.elem_node_local(i,2) != node_coord[i][2] ) {
-        error = 1 ;
+    int error = 0;
+    for (unsigned i = 1; i < FixtureType::ElemNode; ++i) {
+      if (node_coord[0][0] + m_fixture.elem_node_local(i, 0) !=
+              node_coord[i][0] ||
+          node_coord[0][1] + m_fixture.elem_node_local(i, 1) !=
+              node_coord[i][1] ||
+          node_coord[0][2] + m_fixture.elem_node_local(i, 2) !=
+              node_coord[i][2]) {
+        error = 1;
       }
     }
 
-    if ( error ) {
-      ++update.error ;
-    }
-    else {
-      ++update.success ;
+    if (error) {
+      ++update.error;
+    } else {
+      ++update.success;
     }
   }
 
-  FixtureVerifyElemNodeCoord( const FixtureType & f ) : m_fixture(f) {}
+  FixtureVerifyElemNodeCoord(const FixtureType& f) : m_fixture(f) {}
 };
 
+template <class Device>
+void test_fixture() {
+  typedef Kokkos::Example::BoxElemFixture<
+      Device, Kokkos::Example::BoxElemPart::ElemLinear>
+      FixtureType;
 
-template< class Device >
-void test_fixture()
-{
-  typedef Kokkos::Example::BoxElemFixture< Device , Kokkos::Example::BoxElemPart::ElemLinear > FixtureType ;
-
-  const Kokkos::Example::BoxElemPart::Decompose
-    decompose = Kokkos::Example::BoxElemPart:: DecomposeElem ; // DecomposeElem | DecomposeNode ;
-
-  const unsigned global_size = 256 ;
-  const unsigned global_nx = 400 ;
-  const unsigned global_ny = 400 ;
-  const unsigned global_nz = 400 ;
+  const Kokkos::Example::BoxElemPart::Decompose decompose =
+      Kokkos::Example::BoxElemPart::DecomposeElem;  // DecomposeElem |
+                                                    // DecomposeNode ;
 
-  for ( unsigned my_rank = 0 ; my_rank < global_size ; ++my_rank ) {
+  const unsigned global_size = 256;
+  const unsigned global_nx   = 400;
+  const unsigned global_ny   = 400;
+  const unsigned global_nz   = 400;
 
-    const FixtureType fixture( decompose , global_size , my_rank , global_nx , global_ny , global_nz );
+  for (unsigned my_rank = 0; my_rank < global_size; ++my_rank) {
+    const FixtureType fixture(decompose, global_size, my_rank, global_nx,
+                              global_ny, global_nz);
 
     // Verify grid coordinates of element's nodes
-    
-    typename FixtureVerifyElemNodeCoord<Device>::value_type result = { 0 , 0 };
 
-    Kokkos::parallel_reduce( fixture.elem_node().extent(0) , FixtureVerifyElemNodeCoord<Device>( fixture ) , result );
+    typename FixtureVerifyElemNodeCoord<Device>::value_type result = {0, 0};
 
-    if ( result.error ) {
+    Kokkos::parallel_reduce(fixture.elem_node().extent(0),
+                            FixtureVerifyElemNodeCoord<Device>(fixture),
+                            result);
+
+    if (result.error) {
       std::cout << "P[" << my_rank << ":" << global_size
                 << "] Fixture elem_node_coord"
                 << " success(" << result.success << ")"
-                << " error(" << result.error << ")"
-                << std::endl ;
+                << " error(" << result.error << ")" << std::endl;
     }
 
     // Check send/recv alignment
-
-
   }
 }
 
-
 } /* namespace Example */
 } /* namespace Kokkos */
 
 #endif /* #ifndef KOKKOS_EXAMPLE_TESTFIXTURE_HPP */
-
diff --git a/example/fenl/fenl_functors.hpp b/example/fenl/fenl_functors.hpp
index 01a4e989da..0a489fa1c0 100644
--- a/example/fenl/fenl_functors.hpp
+++ b/example/fenl/fenl_functors.hpp
@@ -69,44 +69,42 @@ namespace Kokkos {
 namespace Example {
 namespace FENL {
 
-template< class ElemNodeIdView , class CrsGraphType , unsigned ElemNode >
+template <class ElemNodeIdView, class CrsGraphType, unsigned ElemNode>
 class NodeNodeGraph {
-public:
+ public:
+  typedef typename ElemNodeIdView::execution_space execution_space;
+  typedef pair<unsigned, unsigned> key_type;
 
-  typedef typename ElemNodeIdView::execution_space execution_space ;
-  typedef pair<unsigned,unsigned> key_type ;
-
-  typedef Kokkos::UnorderedMap< key_type, void , execution_space > SetType ;
-  typedef typename CrsGraphType::row_map_type::non_const_type  RowMapType ;
-  typedef Kokkos::View< unsigned ,  execution_space >              UnsignedValue ;
+  typedef Kokkos::UnorderedMap<key_type, void, execution_space> SetType;
+  typedef typename CrsGraphType::row_map_type::non_const_type RowMapType;
+  typedef Kokkos::View<unsigned, execution_space> UnsignedValue;
 
   // Static dimensions of 0 generate compiler warnings or errors.
-  typedef Kokkos::View< unsigned*[ElemNode][ElemNode] , execution_space >
-    ElemGraphType ;
-
-private:
-
-  enum PhaseType { FILL_NODE_SET ,
-                   SCAN_NODE_COUNT ,
-                   FILL_GRAPH_ENTRIES ,
-                   SORT_GRAPH_ENTRIES ,
-                   FILL_ELEMENT_GRAPH };
-
-  const unsigned        node_count ;
-  const ElemNodeIdView  elem_node_id ;
-  UnsignedValue         row_total ;
-  RowMapType            row_count ;
-  RowMapType            row_map ;
-  SetType               node_node_set ;
-  PhaseType             phase ;
+  typedef Kokkos::View<unsigned * [ElemNode][ElemNode], execution_space>
+      ElemGraphType;
+
+ private:
+  enum PhaseType {
+    FILL_NODE_SET,
+    SCAN_NODE_COUNT,
+    FILL_GRAPH_ENTRIES,
+    SORT_GRAPH_ENTRIES,
+    FILL_ELEMENT_GRAPH
+  };
 
-public:
+  const unsigned node_count;
+  const ElemNodeIdView elem_node_id;
+  UnsignedValue row_total;
+  RowMapType row_count;
+  RowMapType row_map;
+  SetType node_node_set;
+  PhaseType phase;
 
-  CrsGraphType          graph ;
-  ElemGraphType         elem_graph ;
+ public:
+  CrsGraphType graph;
+  ElemGraphType elem_graph;
 
-  struct Times
-  {
+  struct Times {
     double ratio;
     double fill_node_set;
     double scan_node_count;
@@ -115,139 +113,146 @@ class NodeNodeGraph {
     double fill_element_graph;
   };
 
-  NodeNodeGraph( const ElemNodeIdView & arg_elem_node_id ,
-                 const unsigned         arg_node_count,
-                 Times & results
-               )
-    : node_count(arg_node_count)
-    , elem_node_id( arg_elem_node_id )
-    , row_total( "row_total" )
-    , row_count(Kokkos::view_alloc(Kokkos::WithoutInitializing, "row_count") , node_count ) // will deep_copy to 0 inside loop
-    , row_map( "graph_row_map" , node_count + 1 )
-    , node_node_set()
-    , phase( FILL_NODE_SET )
-    , graph()
-    , elem_graph()
-   {
-      //--------------------------------
-      // Guess at span required for the map:
-
-      Kokkos::Timer wall_clock ;
-
-      wall_clock.reset();
-      phase = FILL_NODE_SET ;
-
-      // upper bound on the span
-      size_t set_span = (28ull * node_count) / 2;
-
-      {
-        // Zero the row count to restart the fill
-        Kokkos::deep_copy( row_count , 0u );
-
-        node_node_set = SetType( set_span );
-
-        // May be larger that requested:
-        set_span = node_node_set.span();
-
-        Kokkos::parallel_for( "kokkos-kernels/example/fenl: NodeNodeGraph" , elem_node_id.extent(0) , *this );
-      }
+  NodeNodeGraph(const ElemNodeIdView& arg_elem_node_id,
+                const unsigned arg_node_count, Times& results)
+      : node_count(arg_node_count),
+        elem_node_id(arg_elem_node_id),
+        row_total("row_total"),
+        row_count(Kokkos::view_alloc(Kokkos::WithoutInitializing, "row_count"),
+                  node_count)  // will deep_copy to 0 inside loop
+        ,
+        row_map("graph_row_map", node_count + 1),
+        node_node_set(),
+        phase(FILL_NODE_SET),
+        graph(),
+        elem_graph() {
+    //--------------------------------
+    // Guess at span required for the map:
+
+    Kokkos::Timer wall_clock;
+
+    wall_clock.reset();
+    phase = FILL_NODE_SET;
+
+    // upper bound on the span
+    size_t set_span = (28ull * node_count) / 2;
 
-      execution_space().fence();
-      results.ratio = (double)node_node_set.size() / (double)node_node_set.span();
-      results.fill_node_set = wall_clock.seconds();
-      //--------------------------------
+    {
+      // Zero the row count to restart the fill
+      Kokkos::deep_copy(row_count, 0u);
 
-      wall_clock.reset();
-      phase = SCAN_NODE_COUNT ;
+      node_node_set = SetType(set_span);
 
-      // Exclusive scan of row_count into row_map
-      // including the final total in the 'node_count + 1' position.
-      // Zero the 'row_count' values.
-      Kokkos::parallel_scan( node_count , *this );
+      // May be larger that requested:
+      set_span = node_node_set.span();
 
-      // Zero the row count for the fill:
-      Kokkos::deep_copy( row_count , 0u );
+      Kokkos::parallel_for("kokkos-kernels/example/fenl: NodeNodeGraph",
+                           elem_node_id.extent(0), *this);
+    }
 
-      unsigned graph_entry_count = 0 ;
+    execution_space().fence();
+    results.ratio = (double)node_node_set.size() / (double)node_node_set.span();
+    results.fill_node_set = wall_clock.seconds();
+    //--------------------------------
 
-      Kokkos::deep_copy( graph_entry_count , row_total );
+    wall_clock.reset();
+    phase = SCAN_NODE_COUNT;
 
-      // Assign graph's row_map and allocate graph's entries
-      graph.row_map = row_map ;
-      graph.entries = typename CrsGraphType::entries_type( "graph_entries" , graph_entry_count );
+    // Exclusive scan of row_count into row_map
+    // including the final total in the 'node_count + 1' position.
+    // Zero the 'row_count' values.
+    Kokkos::parallel_scan(node_count, *this);
 
-      //--------------------------------
-      // Fill graph's entries from the (node,node) set.
+    // Zero the row count for the fill:
+    Kokkos::deep_copy(row_count, 0u);
 
-      execution_space().fence();
-      results.scan_node_count = wall_clock.seconds();
+    unsigned graph_entry_count = 0;
 
-      wall_clock.reset();
-      phase = FILL_GRAPH_ENTRIES ;
-      Kokkos::parallel_for( node_node_set.span() , *this );
+    Kokkos::deep_copy(graph_entry_count, row_total);
 
-      execution_space().fence();
-      results.fill_graph_entries = wall_clock.seconds();
+    // Assign graph's row_map and allocate graph's entries
+    graph.row_map = row_map;
+    graph.entries =
+        typename CrsGraphType::entries_type("graph_entries", graph_entry_count);
 
-      //--------------------------------
-      // Done with the temporary sets and arrays
-      wall_clock.reset();
-      phase = SORT_GRAPH_ENTRIES ;
+    //--------------------------------
+    // Fill graph's entries from the (node,node) set.
 
-      row_total = UnsignedValue();
-      row_count = RowMapType();
-      row_map   = RowMapType();
-      node_node_set.clear();
+    execution_space().fence();
+    results.scan_node_count = wall_clock.seconds();
 
-      //--------------------------------
+    wall_clock.reset();
+    phase = FILL_GRAPH_ENTRIES;
+    Kokkos::parallel_for(node_node_set.span(), *this);
 
-      Kokkos::parallel_for( node_count , *this );
+    execution_space().fence();
+    results.fill_graph_entries = wall_clock.seconds();
 
-      execution_space().fence();
-      results.sort_graph_entries = wall_clock.seconds();
+    //--------------------------------
+    // Done with the temporary sets and arrays
+    wall_clock.reset();
+    phase = SORT_GRAPH_ENTRIES;
 
-      //--------------------------------
-      // Element-to-graph mapping:
-      wall_clock.reset();
-      phase = FILL_ELEMENT_GRAPH ;
-      elem_graph = ElemGraphType("elem_graph", elem_node_id.extent(0) );
-      Kokkos::parallel_for( elem_node_id.extent(0) , *this );
+    row_total = UnsignedValue();
+    row_count = RowMapType();
+    row_map   = RowMapType();
+    node_node_set.clear();
 
-      execution_space().fence();
-      results.fill_element_graph = wall_clock.seconds();
-    }
+    //--------------------------------
+
+    Kokkos::parallel_for(node_count, *this);
+
+    execution_space().fence();
+    results.sort_graph_entries = wall_clock.seconds();
+
+    //--------------------------------
+    // Element-to-graph mapping:
+    wall_clock.reset();
+    phase      = FILL_ELEMENT_GRAPH;
+    elem_graph = ElemGraphType("elem_graph", elem_node_id.extent(0));
+    Kokkos::parallel_for(elem_node_id.extent(0), *this);
+
+    execution_space().fence();
+    results.fill_element_graph = wall_clock.seconds();
+  }
 
   //------------------------------------
   // parallel_for: create map and count row length
 
   KOKKOS_INLINE_FUNCTION
-  void fill_set( const unsigned ielem ) const
-  {
+  void fill_set(const unsigned ielem) const {
     // Loop over element's (row_local_node,col_local_node) pairs:
-    for ( unsigned row_local_node = 0 ; row_local_node < elem_node_id.extent(1) ; ++row_local_node ) {
-
-      const unsigned row_node = elem_node_id( ielem , row_local_node );
+    for (unsigned row_local_node = 0; row_local_node < elem_node_id.extent(1);
+         ++row_local_node) {
+      const unsigned row_node = elem_node_id(ielem, row_local_node);
 
-      for ( unsigned col_local_node = row_local_node ; col_local_node < elem_node_id.extent(1) ; ++col_local_node ) {
+      for (unsigned col_local_node = row_local_node;
+           col_local_node < elem_node_id.extent(1); ++col_local_node) {
+        const unsigned col_node = elem_node_id(ielem, col_local_node);
 
-        const unsigned col_node = elem_node_id( ielem , col_local_node );
+        // If either node is locally owned then insert the pair into the
+        // unordered map:
 
-        // If either node is locally owned then insert the pair into the unordered map:
+        if (row_node < row_count.extent(0) || col_node < row_count.extent(0)) {
+          const key_type key = (row_node < col_node)
+                                   ? make_pair(row_node, col_node)
+                                   : make_pair(col_node, row_node);
 
-        if ( row_node < row_count.extent(0) || col_node < row_count.extent(0) ) {
-
-          const key_type key = (row_node < col_node) ? make_pair( row_node, col_node ) : make_pair( col_node, row_node ) ;
-
-          const typename SetType::insert_result result = node_node_set.insert( key );
+          const typename SetType::insert_result result =
+              node_node_set.insert(key);
 
           // A successfull insert: the first time this pair was added
-          if ( result.success() ) {
-
+          if (result.success()) {
             // If row node is owned then increment count
-            if ( row_node < row_count.extent(0) ) { atomic_fetch_add( & row_count( row_node ) , 1 ); }
+            if (row_node < row_count.extent(0)) {
+              atomic_fetch_add(&row_count(row_node), 1);
+            }
 
-            // If column node is owned and not equal to row node then increment count
-            if ( col_node < row_count.extent(0) && col_node != row_node ) { atomic_fetch_add( & row_count( col_node ) , 1 ); }
+            // If column node is owned and not equal to row node then increment
+            // count
+            if (col_node < row_count.extent(0) && col_node != row_node) {
+              atomic_fetch_add(&row_count(col_node), 1);
+            }
           }
         }
       }
@@ -255,114 +260,111 @@ class NodeNodeGraph {
   }
 
   KOKKOS_INLINE_FUNCTION
-  void fill_graph_entries( const unsigned iset ) const
-  {
-    if ( node_node_set.valid_at(iset) ) {
+  void fill_graph_entries(const unsigned iset) const {
+    if (node_node_set.valid_at(iset)) {
       // Add each entry to the graph entries.
 
-      const key_type key = node_node_set.key_at(iset) ;
-      const unsigned row_node = key.first ;
-      const unsigned col_node = key.second ;
+      const key_type key      = node_node_set.key_at(iset);
+      const unsigned row_node = key.first;
+      const unsigned col_node = key.second;
 
-      if ( row_node < row_count.extent(0) ) {
-        const unsigned offset = graph.row_map( row_node ) + atomic_fetch_add( & row_count( row_node ) , 1 );
-        graph.entries( offset ) = col_node ;
+      if (row_node < row_count.extent(0)) {
+        const unsigned offset =
+            graph.row_map(row_node) + atomic_fetch_add(&row_count(row_node), 1);
+        graph.entries(offset) = col_node;
       }
 
-      if ( col_node < row_count.extent(0) && col_node != row_node ) {
-        const unsigned offset = graph.row_map( col_node ) + atomic_fetch_add( & row_count( col_node ) , 1 );
-        graph.entries( offset ) = row_node ;
+      if (col_node < row_count.extent(0) && col_node != row_node) {
+        const unsigned offset =
+            graph.row_map(col_node) + atomic_fetch_add(&row_count(col_node), 1);
+        graph.entries(offset) = row_node;
       }
     }
   }
 
   KOKKOS_INLINE_FUNCTION
-  void sort_graph_entries( const unsigned irow ) const
-  {
-    const unsigned row_beg = graph.row_map( irow );
-    const unsigned row_end = graph.row_map( irow + 1 );
-    for ( unsigned i = row_beg + 1 ; i < row_end ; ++i ) {
+  void sort_graph_entries(const unsigned irow) const {
+    const unsigned row_beg = graph.row_map(irow);
+    const unsigned row_end = graph.row_map(irow + 1);
+    for (unsigned i = row_beg + 1; i < row_end; ++i) {
       const unsigned col = graph.entries(i);
-      unsigned j = i ;
-      for ( ; row_beg < j && col < graph.entries(j-1) ; --j ) {
-        graph.entries(j) = graph.entries(j-1);
+      unsigned j         = i;
+      for (; row_beg < j && col < graph.entries(j - 1); --j) {
+        graph.entries(j) = graph.entries(j - 1);
       }
-      graph.entries(j) = col ;
+      graph.entries(j) = col;
     }
   }
 
   KOKKOS_INLINE_FUNCTION
-  void fill_elem_graph_map( const unsigned ielem ) const
-  {
-    for ( unsigned row_local_node = 0 ; row_local_node < elem_node_id.extent(1) ; ++row_local_node ) {
-
-      const unsigned row_node = elem_node_id( ielem , row_local_node );
-
-      for ( unsigned col_local_node = 0 ; col_local_node < elem_node_id.extent(1) ; ++col_local_node ) {
-
-        const unsigned col_node = elem_node_id( ielem , col_local_node );
+  void fill_elem_graph_map(const unsigned ielem) const {
+    for (unsigned row_local_node = 0; row_local_node < elem_node_id.extent(1);
+         ++row_local_node) {
+      const unsigned row_node = elem_node_id(ielem, row_local_node);
 
-        unsigned entry = ~0u ;
+      for (unsigned col_local_node = 0; col_local_node < elem_node_id.extent(1);
+           ++col_local_node) {
+        const unsigned col_node = elem_node_id(ielem, col_local_node);
 
-        if ( row_node + 1 < graph.row_map.extent(0) ) {
+        unsigned entry = ~0u;
 
-          const unsigned entry_end = graph.row_map( row_node + 1 );
+        if (row_node + 1 < graph.row_map.extent(0)) {
+          const unsigned entry_end = graph.row_map(row_node + 1);
 
-          entry = graph.row_map( row_node );
+          entry = graph.row_map(row_node);
 
-          for ( ; entry < entry_end && graph.entries(entry) != col_node ; ++entry );
+          for (; entry < entry_end && graph.entries(entry) != col_node; ++entry)
+            ;
 
-          if ( entry == entry_end ) entry = ~0u ;
+          if (entry == entry_end) entry = ~0u;
         }
 
-        elem_graph( ielem , row_local_node , col_local_node ) = entry ;
+        elem_graph(ielem, row_local_node, col_local_node) = entry;
       }
     }
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const unsigned iwork ) const
-  {
-    if ( phase == FILL_NODE_SET ) {
-      fill_set( iwork );
-    }
-    else if ( phase == FILL_GRAPH_ENTRIES ) {
-      fill_graph_entries( iwork );
-    }
-    else if ( phase == SORT_GRAPH_ENTRIES ) {
-      sort_graph_entries( iwork );
-    }
-    else if ( phase == FILL_ELEMENT_GRAPH ) {
-      fill_elem_graph_map( iwork );
+  void operator()(const unsigned iwork) const {
+    if (phase == FILL_NODE_SET) {
+      fill_set(iwork);
+    } else if (phase == FILL_GRAPH_ENTRIES) {
+      fill_graph_entries(iwork);
+    } else if (phase == SORT_GRAPH_ENTRIES) {
+      sort_graph_entries(iwork);
+    } else if (phase == FILL_ELEMENT_GRAPH) {
+      fill_elem_graph_map(iwork);
     }
   }
 
   //------------------------------------
   // parallel_scan: row offsets
 
-  typedef unsigned value_type ;
+  typedef unsigned value_type;
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const unsigned irow , unsigned & update , const bool final ) const
-  {
+  void operator()(const unsigned irow, unsigned& update,
+                  const bool final) const {
     // exclusive scan
-    if ( final ) { row_map( irow ) = update ; }
+    if (final) {
+      row_map(irow) = update;
+    }
 
-    update += row_count( irow );
+    update += row_count(irow);
 
-    if ( final ) {
-      if ( irow + 1 == row_count.extent(0) ) {
-        row_map( irow + 1 ) = update ;
-        row_total()         = update ;
+    if (final) {
+      if (irow + 1 == row_count.extent(0)) {
+        row_map(irow + 1) = update;
+        row_total()       = update;
       }
     }
   }
 
   KOKKOS_INLINE_FUNCTION
-  void init( unsigned & update ) const { update = 0 ; }
+  void init(unsigned& update) const { update = 0; }
 
   KOKKOS_INLINE_FUNCTION
-  void join( volatile unsigned & update , const volatile unsigned & input ) const { update += input ; }
+  void join(unsigned& update, const unsigned& input) const { update += input; }
 
   //------------------------------------
 };
@@ -377,222 +379,210 @@ namespace Kokkos {
 namespace Example {
 namespace FENL {
 
-template< class ElemCompType >
+template <class ElemCompType>
 class NodeElemGatherFill {
-public:
-
-  typedef typename ElemCompType::execution_space         execution_space ;
-  typedef typename ElemCompType::vector_type         vector_type ;
-  typedef typename ElemCompType::sparse_matrix_type  sparse_matrix_type ;
-  typedef typename ElemCompType::elem_node_type      elem_node_type ;
-  typedef typename ElemCompType::elem_vectors_type   elem_vectors_type ;
-  typedef typename ElemCompType::elem_matrices_type  elem_matrices_type ;
-  typedef typename ElemCompType::elem_graph_type     elem_graph_type ;
+ public:
+  typedef typename ElemCompType::execution_space execution_space;
+  typedef typename ElemCompType::vector_type vector_type;
+  typedef typename ElemCompType::sparse_matrix_type sparse_matrix_type;
+  typedef typename ElemCompType::elem_node_type elem_node_type;
+  typedef typename ElemCompType::elem_vectors_type elem_vectors_type;
+  typedef typename ElemCompType::elem_matrices_type elem_matrices_type;
+  typedef typename ElemCompType::elem_graph_type elem_graph_type;
 
-  static const unsigned ElemNodeCount = ElemCompType::ElemNodeCount ;
+  static const unsigned ElemNodeCount = ElemCompType::ElemNodeCount;
 
   //------------------------------------
 
-private:
-
-  typedef Kokkos::StaticCrsGraph< unsigned[2] , execution_space >  CrsGraphType ;
-  typedef typename CrsGraphType::row_map_type::non_const_type  RowMapType ;
-  typedef Kokkos::View< unsigned ,  execution_space >              UnsignedValue ;
-
-  enum PhaseType { FILL_NODE_COUNT ,
-                   SCAN_NODE_COUNT ,
-                   FILL_GRAPH_ENTRIES ,
-                   SORT_GRAPH_ENTRIES ,
-                   GATHER_FILL };
-
-  const elem_node_type  elem_node_id ;
-  const elem_graph_type elem_graph ;
-  UnsignedValue         row_total ;
-  RowMapType            row_count ;
-  RowMapType            row_map ;
-  CrsGraphType          graph ;
-  vector_type           residual ;
-  sparse_matrix_type    jacobian ;
-  elem_vectors_type     elem_residual ;
-  elem_matrices_type    elem_jacobian ;
-  PhaseType             phase ;
-
-public:
+ private:
+  typedef Kokkos::StaticCrsGraph<unsigned[2], execution_space> CrsGraphType;
+  typedef typename CrsGraphType::row_map_type::non_const_type RowMapType;
+  typedef Kokkos::View<unsigned, execution_space> UnsignedValue;
+
+  enum PhaseType {
+    FILL_NODE_COUNT,
+    SCAN_NODE_COUNT,
+    FILL_GRAPH_ENTRIES,
+    SORT_GRAPH_ENTRIES,
+    GATHER_FILL
+  };
 
+  const elem_node_type elem_node_id;
+  const elem_graph_type elem_graph;
+  UnsignedValue row_total;
+  RowMapType row_count;
+  RowMapType row_map;
+  CrsGraphType graph;
+  vector_type residual;
+  sparse_matrix_type jacobian;
+  elem_vectors_type elem_residual;
+  elem_matrices_type elem_jacobian;
+  PhaseType phase;
+
+ public:
   NodeElemGatherFill()
-    : elem_node_id()
-    , elem_graph()
-    , row_total()
-    , row_count()
-    , row_map()
-    , graph()
-    , residual()
-    , jacobian()
-    , elem_residual()
-    , elem_jacobian()
-    , phase( FILL_NODE_COUNT )
-    {}
-
-  NodeElemGatherFill( const NodeElemGatherFill & rhs )
-    : elem_node_id(  rhs.elem_node_id )
-    , elem_graph(    rhs.elem_graph )
-    , row_total(     rhs.row_total )
-    , row_count(     rhs.row_count )
-    , row_map(       rhs.row_map )
-    , graph(         rhs.graph )
-    , residual(      rhs.residual )
-    , jacobian(      rhs.jacobian )
-    , elem_residual( rhs.elem_residual )
-    , elem_jacobian( rhs.elem_jacobian )
-    , phase(         rhs.phase )
-    {}
-
-  NodeElemGatherFill( const elem_node_type     & arg_elem_node_id ,
-                      const elem_graph_type    & arg_elem_graph ,
-                      const vector_type        & arg_residual ,
-                      const sparse_matrix_type & arg_jacobian ,
-                      const elem_vectors_type  & arg_elem_residual ,
-                      const elem_matrices_type & arg_elem_jacobian )
-    : elem_node_id( arg_elem_node_id )
-    , elem_graph( arg_elem_graph )
-    , row_total( "row_total" )
-    , row_count( "row_count" , arg_residual.extent(0) )
-    , row_map( "graph_row_map" , arg_residual.extent(0) + 1 )
-    , graph()
-    , residual( arg_residual )
-    , jacobian( arg_jacobian )
-    , elem_residual( arg_elem_residual )
-    , elem_jacobian( arg_elem_jacobian )
-    , phase( FILL_NODE_COUNT )
-    {
-      //--------------------------------
-      // Count node->element relations
-
-      phase = FILL_NODE_COUNT ;
-
-      Kokkos::parallel_for( elem_node_id.extent(0) , *this );
-
-      //--------------------------------
-
-      phase = SCAN_NODE_COUNT ;
-
-      // Exclusive scan of row_count into row_map
-      // including the final total in the 'node_count + 1' position.
-      // Zero the 'row_count' values.
-      Kokkos::parallel_scan( residual.extent(0) , *this );
-
-      // Zero the row count for the fill:
-      Kokkos::deep_copy( row_count , typename RowMapType::value_type(0) );
-
-      unsigned graph_entry_count = 0 ;
-
-      Kokkos::deep_copy( graph_entry_count , row_total );
-
-      // Assign graph's row_map and allocate graph's entries
-      graph.row_map = row_map ;
-
-      typedef typename CrsGraphType::entries_type graph_entries_type ;
-
-      graph.entries = graph_entries_type( "graph_entries" , graph_entry_count );
-
-      //--------------------------------
-      // Fill graph's entries from the (node,node) set.
-
-      phase = FILL_GRAPH_ENTRIES ;
-
-      Kokkos::deep_copy( row_count , 0u );
-      Kokkos::parallel_for( elem_node_id.extent(0) , *this );
-
-      execution_space().fence();
-
-      //--------------------------------
-      // Done with the temporary sets and arrays
-
-      row_total = UnsignedValue();
-      row_count = RowMapType();
-      row_map   = RowMapType();
-
-      //--------------------------------
-
-      phase = SORT_GRAPH_ENTRIES ;
-      Kokkos::parallel_for( residual.extent(0) , *this );
-
-      execution_space().fence();
-
-      phase = GATHER_FILL ;
-    }
-
-  void apply() const
-  {
-    Kokkos::parallel_for( residual.extent(0) , *this );
+      : elem_node_id(),
+        elem_graph(),
+        row_total(),
+        row_count(),
+        row_map(),
+        graph(),
+        residual(),
+        jacobian(),
+        elem_residual(),
+        elem_jacobian(),
+        phase(FILL_NODE_COUNT) {}
+
+  NodeElemGatherFill(const NodeElemGatherFill& rhs)
+      : elem_node_id(rhs.elem_node_id),
+        elem_graph(rhs.elem_graph),
+        row_total(rhs.row_total),
+        row_count(rhs.row_count),
+        row_map(rhs.row_map),
+        graph(rhs.graph),
+        residual(rhs.residual),
+        jacobian(rhs.jacobian),
+        elem_residual(rhs.elem_residual),
+        elem_jacobian(rhs.elem_jacobian),
+        phase(rhs.phase) {}
+
+  NodeElemGatherFill(const elem_node_type& arg_elem_node_id,
+                     const elem_graph_type& arg_elem_graph,
+                     const vector_type& arg_residual,
+                     const sparse_matrix_type& arg_jacobian,
+                     const elem_vectors_type& arg_elem_residual,
+                     const elem_matrices_type& arg_elem_jacobian)
+      : elem_node_id(arg_elem_node_id),
+        elem_graph(arg_elem_graph),
+        row_total("row_total"),
+        row_count("row_count", arg_residual.extent(0)),
+        row_map("graph_row_map", arg_residual.extent(0) + 1),
+        graph(),
+        residual(arg_residual),
+        jacobian(arg_jacobian),
+        elem_residual(arg_elem_residual),
+        elem_jacobian(arg_elem_jacobian),
+        phase(FILL_NODE_COUNT) {
+    //--------------------------------
+    // Count node->element relations
+
+    phase = FILL_NODE_COUNT;
+
+    Kokkos::parallel_for(elem_node_id.extent(0), *this);
+
+    //--------------------------------
+
+    phase = SCAN_NODE_COUNT;
+
+    // Exclusive scan of row_count into row_map
+    // including the final total in the 'node_count + 1' position.
+    // Zero the 'row_count' values.
+    Kokkos::parallel_scan(residual.extent(0), *this);
+
+    // Zero the row count for the fill:
+    Kokkos::deep_copy(row_count, typename RowMapType::value_type(0));
+
+    unsigned graph_entry_count = 0;
+
+    Kokkos::deep_copy(graph_entry_count, row_total);
+
+    // Assign graph's row_map and allocate graph's entries
+    graph.row_map = row_map;
+
+    typedef typename CrsGraphType::entries_type graph_entries_type;
+
+    graph.entries = graph_entries_type("graph_entries", graph_entry_count);
+
+    //--------------------------------
+    // Fill graph's entries from the (node,node) set.
+
+    phase = FILL_GRAPH_ENTRIES;
+
+    Kokkos::deep_copy(row_count, 0u);
+    Kokkos::parallel_for(elem_node_id.extent(0), *this);
+
+    execution_space().fence();
+
+    //--------------------------------
+    // Done with the temporary sets and arrays
+
+    row_total = UnsignedValue();
+    row_count = RowMapType();
+    row_map   = RowMapType();
+
+    //--------------------------------
+
+    phase = SORT_GRAPH_ENTRIES;
+    Kokkos::parallel_for(residual.extent(0), *this);
+
+    execution_space().fence();
+
+    phase = GATHER_FILL;
   }
 
+  void apply() const { Kokkos::parallel_for(residual.extent(0), *this); }
+
   //------------------------------------
   //------------------------------------
   // parallel_for: Count node->element pairs
 
   KOKKOS_INLINE_FUNCTION
-  void fill_node_count( const unsigned ielem ) const
-  {
-    for ( unsigned row_local_node = 0 ; row_local_node < elem_node_id.extent(1) ; ++row_local_node ) {
+  void fill_node_count(const unsigned ielem) const {
+    for (unsigned row_local_node = 0; row_local_node < elem_node_id.extent(1);
+         ++row_local_node) {
+      const unsigned row_node = elem_node_id(ielem, row_local_node);
 
-      const unsigned row_node = elem_node_id( ielem , row_local_node );
-
-      if ( row_node < row_count.extent(0) ) {
-        atomic_fetch_add( & row_count( row_node ) , 1 );
+      if (row_node < row_count.extent(0)) {
+        atomic_fetch_add(&row_count(row_node), 1);
       }
     }
   }
 
   KOKKOS_INLINE_FUNCTION
-  void fill_graph_entries( const unsigned ielem ) const
-  {
-    for ( unsigned row_local_node = 0 ; row_local_node < elem_node_id.extent(1) ; ++row_local_node ) {
-
-      const unsigned row_node = elem_node_id( ielem , row_local_node );
-
-      if ( row_node < row_count.extent(0) ) {
+  void fill_graph_entries(const unsigned ielem) const {
+    for (unsigned row_local_node = 0; row_local_node < elem_node_id.extent(1);
+         ++row_local_node) {
+      const unsigned row_node = elem_node_id(ielem, row_local_node);
 
-        const unsigned offset = graph.row_map( row_node ) + atomic_fetch_add( & row_count( row_node ) , 1 );
+      if (row_node < row_count.extent(0)) {
+        const unsigned offset =
+            graph.row_map(row_node) + atomic_fetch_add(&row_count(row_node), 1);
 
-        graph.entries( offset , 0 ) = ielem ;
-        graph.entries( offset , 1 ) = row_local_node ;
+        graph.entries(offset, 0) = ielem;
+        graph.entries(offset, 1) = row_local_node;
       }
     }
   }
 
   KOKKOS_INLINE_FUNCTION
-  void sort_graph_entries( const unsigned irow ) const
-  {
-    const unsigned row_beg = graph.row_map( irow );
-    const unsigned row_end = graph.row_map( irow + 1 );
-    for ( unsigned i = row_beg + 1 ; i < row_end ; ++i ) {
-      const unsigned elem  = graph.entries(i,0);
-      const unsigned local = graph.entries(i,1);
-      unsigned j = i ;
-      for ( ; row_beg < j && elem < graph.entries(j-1,0) ; --j ) {
-        graph.entries(j,0) = graph.entries(j-1,0);
-        graph.entries(j,1) = graph.entries(j-1,1);
+  void sort_graph_entries(const unsigned irow) const {
+    const unsigned row_beg = graph.row_map(irow);
+    const unsigned row_end = graph.row_map(irow + 1);
+    for (unsigned i = row_beg + 1; i < row_end; ++i) {
+      const unsigned elem  = graph.entries(i, 0);
+      const unsigned local = graph.entries(i, 1);
+      unsigned j           = i;
+      for (; row_beg < j && elem < graph.entries(j - 1, 0); --j) {
+        graph.entries(j, 0) = graph.entries(j - 1, 0);
+        graph.entries(j, 1) = graph.entries(j - 1, 1);
       }
-      graph.entries(j,0) = elem ;
-      graph.entries(j,1) = local ;
+      graph.entries(j, 0) = elem;
+      graph.entries(j, 1) = local;
     }
   }
 
   //------------------------------------
 
   KOKKOS_INLINE_FUNCTION
-  void gather_fill( const unsigned irow ) const
-  {
+  void gather_fill(const unsigned irow) const {
     const unsigned node_elem_begin = graph.row_map(irow);
-    const unsigned node_elem_end   = graph.row_map(irow+1);
+    const unsigned node_elem_end   = graph.row_map(irow + 1);
 
     //  for each element that a node belongs to
 
-    for ( unsigned i = node_elem_begin ; i < node_elem_end ; i++ ) {
-
-      const unsigned elem_id   = graph.entries( i, 0);
-      const unsigned row_index = graph.entries( i, 1);
+    for (unsigned i = node_elem_begin; i < node_elem_end; i++) {
+      const unsigned elem_id   = graph.entries(i, 0);
+      const unsigned row_index = graph.entries(i, 1);
 
       residual(irow) += elem_residual(elem_id, row_index);
 
@@ -600,10 +590,10 @@ class NodeElemGatherFill {
       //  gather the contents of the element stiffness
       //  matrix that belong in irow
 
-      for ( unsigned j = 0 ; j < ElemNodeCount ; ++j ) {
-        const unsigned A_index = elem_graph( elem_id , row_index , j );
+      for (unsigned j = 0; j < ElemNodeCount; ++j) {
+        const unsigned A_index = elem_graph(elem_id, row_index, j);
 
-        jacobian.values( A_index ) += elem_jacobian( elem_id, row_index, j );
+        jacobian.values(A_index) += elem_jacobian(elem_id, row_index, j);
       }
     }
   }
@@ -611,48 +601,46 @@ class NodeElemGatherFill {
   //------------------------------------
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const unsigned iwork ) const
-  {
-    if ( phase == FILL_NODE_COUNT ) {
-      fill_node_count( iwork );
-    }
-    else if ( phase == FILL_GRAPH_ENTRIES ) {
-      fill_graph_entries( iwork );
-    }
-    else if ( phase == SORT_GRAPH_ENTRIES ) {
-      sort_graph_entries( iwork );
-    }
-    else if ( phase == GATHER_FILL ) {
-      gather_fill( iwork );
+  void operator()(const unsigned iwork) const {
+    if (phase == FILL_NODE_COUNT) {
+      fill_node_count(iwork);
+    } else if (phase == FILL_GRAPH_ENTRIES) {
+      fill_graph_entries(iwork);
+    } else if (phase == SORT_GRAPH_ENTRIES) {
+      sort_graph_entries(iwork);
+    } else if (phase == GATHER_FILL) {
+      gather_fill(iwork);
     }
   }
 
   //------------------------------------
   // parallel_scan: row offsets
 
-  typedef unsigned value_type ;
+  typedef unsigned value_type;
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const unsigned irow , unsigned & update , const bool final ) const
-  {
+  void operator()(const unsigned irow, unsigned& update,
+                  const bool final) const {
     // exclusive scan
-    if ( final ) { row_map( irow ) = update ; }
+    if (final) {
+      row_map(irow) = update;
+    }
 
-    update += row_count( irow );
+    update += row_count(irow);
 
-    if ( final ) {
-      if ( irow + 1 == row_count.extent(0) ) {
-        row_map( irow + 1 ) = update ;
-        row_total()         = update ;
+    if (final) {
+      if (irow + 1 == row_count.extent(0)) {
+        row_map(irow + 1) = update;
+        row_total()       = update;
       }
     }
   }
 
   KOKKOS_INLINE_FUNCTION
-  void init( unsigned & update ) const { update = 0 ; }
+  void init(unsigned& update) const { update = 0; }
 
   KOKKOS_INLINE_FUNCTION
-  void join( volatile unsigned & update , const volatile unsigned & input ) const { update += input ; }
+  void join(unsigned& update, const unsigned& input) const { update += input; }
 };
 
 } /* namespace FENL */
@@ -665,188 +653,191 @@ namespace Kokkos {
 namespace Example {
 namespace FENL {
 
-template< class FiniteElementMeshType , class SparseMatrixType >
-class ElementComputation ;
-
+template <class FiniteElementMeshType, class SparseMatrixType>
+class ElementComputation;
 
-template< class DeviceType , BoxElemPart::ElemOrder Order , class CoordinateMap ,
-          typename ScalarType , typename OrdinalType , class MemoryTraits , typename SizeType >
+template <class DeviceType, BoxElemPart::ElemOrder Order, class CoordinateMap,
+          typename ScalarType, typename OrdinalType, class MemoryTraits,
+          typename SizeType>
 class ElementComputation<
-  Kokkos::Example::BoxElemFixture< DeviceType , Order , CoordinateMap > ,
-  KokkosSparse::CrsMatrix< ScalarType , OrdinalType , DeviceType , MemoryTraits , SizeType > >
-{
-public:
-
-  typedef Kokkos::Example::BoxElemFixture< DeviceType, Order, CoordinateMap >  mesh_type ;
-  typedef Kokkos::Example::HexElement_Data< mesh_type::ElemNode >              element_data_type ;
-
-  typedef KokkosSparse::CrsMatrix< ScalarType , OrdinalType , DeviceType , MemoryTraits , SizeType >  sparse_matrix_type ;
-  typedef typename sparse_matrix_type::StaticCrsGraphType                                       sparse_graph_type ;
-
-  typedef DeviceType   execution_space ;
-  typedef ScalarType   scalar_type ;
-
-  static const unsigned SpatialDim       = element_data_type::spatial_dimension ;
-  static const unsigned TensorDim        = SpatialDim * SpatialDim ;
-  static const unsigned ElemNodeCount    = element_data_type::element_node_count ;
-  static const unsigned FunctionCount    = element_data_type::function_count ;
-  static const unsigned IntegrationCount = element_data_type::integration_count ;
+    Kokkos::Example::BoxElemFixture<DeviceType, Order, CoordinateMap>,
+    KokkosSparse::CrsMatrix<ScalarType, OrdinalType, DeviceType, MemoryTraits,
+                            SizeType> > {
+ public:
+  typedef Kokkos::Example::BoxElemFixture<DeviceType, Order, CoordinateMap>
+      mesh_type;
+  typedef Kokkos::Example::HexElement_Data<mesh_type::ElemNode>
+      element_data_type;
+
+  typedef KokkosSparse::CrsMatrix<ScalarType, OrdinalType, DeviceType,
+                                  MemoryTraits, SizeType>
+      sparse_matrix_type;
+  typedef typename sparse_matrix_type::StaticCrsGraphType sparse_graph_type;
+
+  typedef DeviceType execution_space;
+  typedef ScalarType scalar_type;
+
+  static const unsigned SpatialDim    = element_data_type::spatial_dimension;
+  static const unsigned TensorDim     = SpatialDim * SpatialDim;
+  static const unsigned ElemNodeCount = element_data_type::element_node_count;
+  static const unsigned FunctionCount = element_data_type::function_count;
+  static const unsigned IntegrationCount = element_data_type::integration_count;
 
   //------------------------------------
 
-  typedef typename mesh_type::node_coord_type                                      node_coord_type ;
-  typedef typename mesh_type::elem_node_type                                       elem_node_type ;
-  typedef Kokkos::View< scalar_type*[FunctionCount][FunctionCount] , execution_space > elem_matrices_type ;
-  typedef Kokkos::View< scalar_type*[FunctionCount] ,                execution_space > elem_vectors_type ;
-  typedef Kokkos::View< scalar_type* ,                               execution_space > vector_type ;
+  typedef typename mesh_type::node_coord_type node_coord_type;
+  typedef typename mesh_type::elem_node_type elem_node_type;
+  typedef Kokkos::View<scalar_type * [FunctionCount][FunctionCount],
+                       execution_space>
+      elem_matrices_type;
+  typedef Kokkos::View<scalar_type * [FunctionCount], execution_space>
+      elem_vectors_type;
+  typedef Kokkos::View<scalar_type*, execution_space> vector_type;
 
-  typedef typename NodeNodeGraph< elem_node_type , sparse_graph_type , ElemNodeCount >::ElemGraphType elem_graph_type ;
+  typedef typename NodeNodeGraph<elem_node_type, sparse_graph_type,
+                                 ElemNodeCount>::ElemGraphType elem_graph_type;
 
   //------------------------------------
 
-
   //------------------------------------
   // Computational data:
 
-  const element_data_type   elem_data ;
-  const elem_node_type      elem_node_ids ;
-  const node_coord_type     node_coords ;
-  const elem_graph_type     elem_graph ;
-  const elem_matrices_type  elem_jacobians ;
-  const elem_vectors_type   elem_residuals ;
-  const vector_type         solution ;
-  const vector_type         residual ;
-  const sparse_matrix_type  jacobian ;
-  const scalar_type         coeff_K ;
-
-  ElementComputation( const ElementComputation & rhs )
-    : elem_data()
-    , elem_node_ids( rhs.elem_node_ids )
-    , node_coords(   rhs.node_coords )
-    , elem_graph(    rhs.elem_graph )
-    , elem_jacobians( rhs.elem_jacobians )
-    , elem_residuals( rhs.elem_residuals )
-    , solution( rhs.solution )
-    , residual( rhs.residual )
-    , jacobian( rhs.jacobian )
-    , coeff_K( rhs.coeff_K )
-    {}
+  const element_data_type elem_data;
+  const elem_node_type elem_node_ids;
+  const node_coord_type node_coords;
+  const elem_graph_type elem_graph;
+  const elem_matrices_type elem_jacobians;
+  const elem_vectors_type elem_residuals;
+  const vector_type solution;
+  const vector_type residual;
+  const sparse_matrix_type jacobian;
+  const scalar_type coeff_K;
+
+  ElementComputation(const ElementComputation& rhs)
+      : elem_data(),
+        elem_node_ids(rhs.elem_node_ids),
+        node_coords(rhs.node_coords),
+        elem_graph(rhs.elem_graph),
+        elem_jacobians(rhs.elem_jacobians),
+        elem_residuals(rhs.elem_residuals),
+        solution(rhs.solution),
+        residual(rhs.residual),
+        jacobian(rhs.jacobian),
+        coeff_K(rhs.coeff_K) {}
 
   // If the element->sparse_matrix graph is provided then perform atomic updates
-  // Otherwise fill per-element contributions for subequent gather-add into a residual and jacobian.
-  ElementComputation( const mesh_type          & arg_mesh ,
-	              const scalar_type          arg_coeff_K ,
-                      const vector_type        & arg_solution ,
-                      const elem_graph_type    & arg_elem_graph ,
-                      const sparse_matrix_type & arg_jacobian ,
-                      const vector_type        & arg_residual )
-    : elem_data()
-    , elem_node_ids( arg_mesh.elem_node() )
-    , node_coords(   arg_mesh.node_coord() )
-    , elem_graph(    arg_elem_graph )
-    , elem_jacobians()
-    , elem_residuals()
-    , solution( arg_solution )
-    , residual( arg_residual )
-    , jacobian( arg_jacobian )
-    , coeff_K( arg_coeff_K )
-    {}
-
-  ElementComputation( const mesh_type    & arg_mesh ,
-	              const scalar_type    arg_coeff_K ,
-                      const vector_type  & arg_solution )
-    : elem_data()
-    , elem_node_ids( arg_mesh.elem_node() )
-    , node_coords(   arg_mesh.node_coord() )
-    , elem_graph()
-    , elem_jacobians( "elem_jacobians" , arg_mesh.elem_count() )
-    , elem_residuals( "elem_residuals" , arg_mesh.elem_count() )
-    , solution( arg_solution )
-    , residual()
-    , jacobian()
-    , coeff_K( arg_coeff_K )
-    {}
+  // Otherwise fill per-element contributions for subequent gather-add into a
+  // residual and jacobian.
+  ElementComputation(const mesh_type& arg_mesh, const scalar_type arg_coeff_K,
+                     const vector_type& arg_solution,
+                     const elem_graph_type& arg_elem_graph,
+                     const sparse_matrix_type& arg_jacobian,
+                     const vector_type& arg_residual)
+      : elem_data(),
+        elem_node_ids(arg_mesh.elem_node()),
+        node_coords(arg_mesh.node_coord()),
+        elem_graph(arg_elem_graph),
+        elem_jacobians(),
+        elem_residuals(),
+        solution(arg_solution),
+        residual(arg_residual),
+        jacobian(arg_jacobian),
+        coeff_K(arg_coeff_K) {}
+
+  ElementComputation(const mesh_type& arg_mesh, const scalar_type arg_coeff_K,
+                     const vector_type& arg_solution)
+      : elem_data(),
+        elem_node_ids(arg_mesh.elem_node()),
+        node_coords(arg_mesh.node_coord()),
+        elem_graph(),
+        elem_jacobians("elem_jacobians", arg_mesh.elem_count()),
+        elem_residuals("elem_residuals", arg_mesh.elem_count()),
+        solution(arg_solution),
+        residual(),
+        jacobian(),
+        coeff_K(arg_coeff_K) {}
 
   //------------------------------------
 
-  void apply() const
-  {
-    parallel_for( elem_node_ids.extent(0) , *this );
-  }
+  void apply() const { parallel_for(elem_node_ids.extent(0), *this); }
 
   //------------------------------------
 
   static const unsigned FLOPS_transform_gradients =
-     /* Jacobian */           FunctionCount * TensorDim * 2 +
-     /* Inverse jacobian */   TensorDim * 6 + 6 +
-     /* Gradient transform */ FunctionCount * 15 ;
+      /* Jacobian */ FunctionCount * TensorDim * 2 +
+      /* Inverse jacobian */ TensorDim * 6 + 6 +
+      /* Gradient transform */ FunctionCount * 15;
 
   KOKKOS_INLINE_FUNCTION
   float transform_gradients(
-    const float grad[][ FunctionCount ] , // Gradient of bases master element
-    const double x[] ,
-    const double y[] ,
-    const double z[] ,
-    float dpsidx[] ,
-    float dpsidy[] ,
-    float dpsidz[] ) const
-  {
-    enum { j11 = 0 , j12 = 1 , j13 = 2 ,
-           j21 = 3 , j22 = 4 , j23 = 5 ,
-           j31 = 6 , j32 = 7 , j33 = 8 };
+      const float grad[][FunctionCount],  // Gradient of bases master element
+      const double x[], const double y[], const double z[], float dpsidx[],
+      float dpsidy[], float dpsidz[]) const {
+    enum {
+      j11 = 0,
+      j12 = 1,
+      j13 = 2,
+      j21 = 3,
+      j22 = 4,
+      j23 = 5,
+      j31 = 6,
+      j32 = 7,
+      j33 = 8
+    };
 
     // Jacobian accumulation:
 
-    double J[ TensorDim ] = { 0, 0, 0,  0, 0, 0,  0, 0, 0 };
+    double J[TensorDim] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
 
-    for( unsigned i = 0; i < FunctionCount ; ++i ) {
-      const double x1 = x[i] ;
-      const double x2 = y[i] ;
-      const double x3 = z[i] ;
+    for (unsigned i = 0; i < FunctionCount; ++i) {
+      const double x1 = x[i];
+      const double x2 = y[i];
+      const double x3 = z[i];
 
-      const float g1 = grad[0][i] ;
-      const float g2 = grad[1][i] ;
-      const float g3 = grad[2][i] ;
+      const float g1 = grad[0][i];
+      const float g2 = grad[1][i];
+      const float g3 = grad[2][i];
 
-      J[j11] += g1 * x1 ;
-      J[j12] += g1 * x2 ;
-      J[j13] += g1 * x3 ;
+      J[j11] += g1 * x1;
+      J[j12] += g1 * x2;
+      J[j13] += g1 * x3;
 
-      J[j21] += g2 * x1 ;
-      J[j22] += g2 * x2 ;
-      J[j23] += g2 * x3 ;
+      J[j21] += g2 * x1;
+      J[j22] += g2 * x2;
+      J[j23] += g2 * x3;
 
-      J[j31] += g3 * x1 ;
-      J[j32] += g3 * x2 ;
-      J[j33] += g3 * x3 ;
+      J[j31] += g3 * x1;
+      J[j32] += g3 * x2;
+      J[j33] += g3 * x3;
     }
 
     // Inverse jacobian:
 
-    float invJ[ TensorDim ] = {
-      static_cast<float>( J[j22] * J[j33] - J[j23] * J[j32] ) ,
-      static_cast<float>( J[j13] * J[j32] - J[j12] * J[j33] ) ,
-      static_cast<float>( J[j12] * J[j23] - J[j13] * J[j22] ) ,
+    float invJ[TensorDim] = {
+        static_cast<float>(J[j22] * J[j33] - J[j23] * J[j32]),
+        static_cast<float>(J[j13] * J[j32] - J[j12] * J[j33]),
+        static_cast<float>(J[j12] * J[j23] - J[j13] * J[j22]),
 
-      static_cast<float>( J[j23] * J[j31] - J[j21] * J[j33] ) ,
-      static_cast<float>( J[j11] * J[j33] - J[j13] * J[j31] ) ,
-      static_cast<float>( J[j13] * J[j21] - J[j11] * J[j23] ) ,
+        static_cast<float>(J[j23] * J[j31] - J[j21] * J[j33]),
+        static_cast<float>(J[j11] * J[j33] - J[j13] * J[j31]),
+        static_cast<float>(J[j13] * J[j21] - J[j11] * J[j23]),
 
-      static_cast<float>( J[j21] * J[j32] - J[j22] * J[j31] ) ,
-      static_cast<float>( J[j12] * J[j31] - J[j11] * J[j32] ) ,
-      static_cast<float>( J[j11] * J[j22] - J[j12] * J[j21] ) };
+        static_cast<float>(J[j21] * J[j32] - J[j22] * J[j31]),
+        static_cast<float>(J[j12] * J[j31] - J[j11] * J[j32]),
+        static_cast<float>(J[j11] * J[j22] - J[j12] * J[j21])};
 
-    const float detJ = J[j11] * invJ[j11] +
-                       J[j21] * invJ[j12] +
-                       J[j31] * invJ[j13] ;
+    const float detJ =
+        J[j11] * invJ[j11] + J[j21] * invJ[j12] + J[j31] * invJ[j13];
 
-    const float detJinv = 1.0 / detJ ;
+    const float detJinv = 1.0 / detJ;
 
-    for ( unsigned i = 0 ; i < TensorDim ; ++i ) { invJ[i] *= detJinv ; }
+    for (unsigned i = 0; i < TensorDim; ++i) {
+      invJ[i] *= detJinv;
+    }
 
     // Transform gradients:
 
-    for( unsigned i = 0; i < FunctionCount ; ++i ) {
+    for (unsigned i = 0; i < FunctionCount; ++i) {
       const float g0 = grad[0][i];
       const float g1 = grad[1][i];
       const float g2 = grad[2][i];
@@ -856,113 +847,101 @@ class ElementComputation<
       dpsidz[i] = g0 * invJ[j31] + g1 * invJ[j32] + g2 * invJ[j33];
     }
 
-    return detJ ;
+    return detJ;
   }
 
   KOKKOS_INLINE_FUNCTION
-  void contributeResidualJacobian(
-    const float coeff_k ,
-    const double dof_values[] ,
-    const float dpsidx[] ,
-    const float dpsidy[] ,
-    const float dpsidz[] ,
-    const float detJ ,
-    const float integ_weight ,
-    const float bases_vals[] ,
-    double elem_res[] ,
-    double elem_mat[][ FunctionCount ] ) const
-  {
-    double value_at_pt = 0 ;
-    double gradx_at_pt = 0 ;
-    double grady_at_pt = 0 ;
-    double gradz_at_pt = 0 ;
-
-    for ( unsigned m = 0 ; m < FunctionCount ; m++ ) {
-      value_at_pt += dof_values[m] * bases_vals[m] ;
-      gradx_at_pt += dof_values[m] * dpsidx[m] ;
-      grady_at_pt += dof_values[m] * dpsidy[m] ;
-      gradz_at_pt += dof_values[m] * dpsidz[m] ;
+  void contributeResidualJacobian(const float coeff_k,
+                                  const double dof_values[],
+                                  const float dpsidx[], const float dpsidy[],
+                                  const float dpsidz[], const float detJ,
+                                  const float integ_weight,
+                                  const float bases_vals[], double elem_res[],
+                                  double elem_mat[][FunctionCount]) const {
+    double value_at_pt = 0;
+    double gradx_at_pt = 0;
+    double grady_at_pt = 0;
+    double gradz_at_pt = 0;
+
+    for (unsigned m = 0; m < FunctionCount; m++) {
+      value_at_pt += dof_values[m] * bases_vals[m];
+      gradx_at_pt += dof_values[m] * dpsidx[m];
+      grady_at_pt += dof_values[m] * dpsidy[m];
+      gradz_at_pt += dof_values[m] * dpsidz[m];
     }
 
-    const scalar_type k_detJ_weight = coeff_k        * detJ * integ_weight ;
-    const double res_val = value_at_pt * value_at_pt * detJ * integ_weight ;
-    const double mat_val = 2.0 * value_at_pt         * detJ * integ_weight ;
+    const scalar_type k_detJ_weight = coeff_k * detJ * integ_weight;
+    const double res_val = value_at_pt * value_at_pt * detJ * integ_weight;
+    const double mat_val = 2.0 * value_at_pt * detJ * integ_weight;
 
-    // $$ R_i = \int_{\Omega} \nabla \phi_i \cdot (k \nabla T) + \phi_i T^2 d \Omega $$
-    // $$ J_{i,j} = \frac{\partial R_i}{\partial T_j} = \int_{\Omega} k \nabla \phi_i \cdot \nabla \phi_j + 2 \phi_i \phi_j T d \Omega $$
+    // $$ R_i = \int_{\Omega} \nabla \phi_i \cdot (k \nabla T) + \phi_i T^2 d
+    // \Omega $$
+    // $$ J_{i,j} = \frac{\partial R_i}{\partial T_j} = \int_{\Omega} k \nabla
+    // \phi_i \cdot \nabla \phi_j + 2 \phi_i \phi_j T d \Omega $$
 
-    for ( unsigned m = 0; m < FunctionCount; ++m) {
-      double * const mat = elem_mat[m] ;
+    for (unsigned m = 0; m < FunctionCount; ++m) {
+      double* const mat       = elem_mat[m];
       const float bases_val_m = bases_vals[m];
-      const float dpsidx_m    = dpsidx[m] ;
-      const float dpsidy_m    = dpsidy[m] ;
-      const float dpsidz_m    = dpsidz[m] ;
-
-      elem_res[m] += k_detJ_weight * ( dpsidx_m * gradx_at_pt +
-                                       dpsidy_m * grady_at_pt +
-                                       dpsidz_m * gradz_at_pt ) +
-                     res_val * bases_val_m ;
-
-      for( unsigned n = 0; n < FunctionCount; n++) {
-
-        mat[n] += k_detJ_weight * ( dpsidx_m * dpsidx[n] +
-                                    dpsidy_m * dpsidy[n] +
-                                    dpsidz_m * dpsidz[n] ) +
+      const float dpsidx_m    = dpsidx[m];
+      const float dpsidy_m    = dpsidy[m];
+      const float dpsidz_m    = dpsidz[m];
+
+      elem_res[m] +=
+          k_detJ_weight * (dpsidx_m * gradx_at_pt + dpsidy_m * grady_at_pt +
+                           dpsidz_m * gradz_at_pt) +
+          res_val * bases_val_m;
+
+      for (unsigned n = 0; n < FunctionCount; n++) {
+        mat[n] += k_detJ_weight * (dpsidx_m * dpsidx[n] + dpsidy_m * dpsidy[n] +
+                                   dpsidz_m * dpsidz[n]) +
                   mat_val * bases_val_m * bases_vals[n];
       }
     }
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const unsigned ielem ) const
-  {
+  void operator()(const unsigned ielem) const {
     // Gather nodal coordinates and solution vector:
 
-    double x[ FunctionCount ] ;
-    double y[ FunctionCount ] ;
-    double z[ FunctionCount ] ;
-    double val[ FunctionCount ] ;
-    unsigned node_index[ ElemNodeCount ];
+    double x[FunctionCount];
+    double y[FunctionCount];
+    double z[FunctionCount];
+    double val[FunctionCount];
+    unsigned node_index[ElemNodeCount];
 
-    for ( unsigned i = 0 ; i < ElemNodeCount ; ++i ) {
-      const unsigned ni = elem_node_ids( ielem , i );
+    for (unsigned i = 0; i < ElemNodeCount; ++i) {
+      const unsigned ni = elem_node_ids(ielem, i);
 
-      node_index[i] = ni ;
+      node_index[i] = ni;
 
-      x[i] = node_coords( ni , 0 );
-      y[i] = node_coords( ni , 1 );
-      z[i] = node_coords( ni , 2 );
+      x[i] = node_coords(ni, 0);
+      y[i] = node_coords(ni, 1);
+      z[i] = node_coords(ni, 2);
 
-      val[i] = solution( ni );
+      val[i] = solution(ni);
     }
 
+    double elem_vec[FunctionCount];
+    double elem_mat[FunctionCount][FunctionCount];
 
-    double elem_vec[ FunctionCount ] ;
-    double elem_mat[ FunctionCount ][ FunctionCount ] ;
-
-    for( unsigned i = 0; i < FunctionCount ; i++ ) {
-      elem_vec[i] = 0 ;
-      for( unsigned j = 0; j < FunctionCount ; j++){
-        elem_mat[i][j] = 0 ;
+    for (unsigned i = 0; i < FunctionCount; i++) {
+      elem_vec[i] = 0;
+      for (unsigned j = 0; j < FunctionCount; j++) {
+        elem_mat[i][j] = 0;
       }
     }
 
+    for (unsigned i = 0; i < IntegrationCount; ++i) {
+      float dpsidx[FunctionCount];
+      float dpsidy[FunctionCount];
+      float dpsidz[FunctionCount];
 
-    for ( unsigned i = 0 ; i < IntegrationCount ; ++i ) {
-      float dpsidx[ FunctionCount ] ;
-      float dpsidy[ FunctionCount ] ;
-      float dpsidz[ FunctionCount ] ;
+      const float detJ = transform_gradients(elem_data.gradients[i], x, y, z,
+                                             dpsidx, dpsidy, dpsidz);
 
-      const float detJ =
-        transform_gradients( elem_data.gradients[i] , x , y , z ,
-                             dpsidx , dpsidy , dpsidz );
-
-      contributeResidualJacobian( coeff_K ,
-                                  val , dpsidx , dpsidy , dpsidz ,
-                                  detJ ,
-                                  elem_data.weights[i] ,
-                                  elem_data.values[i] ,
-                                  elem_vec , elem_mat );
+      contributeResidualJacobian(coeff_K, val, dpsidx, dpsidy, dpsidz, detJ,
+                                 elem_data.weights[i], elem_data.values[i],
+                                 elem_vec, elem_mat);
     }
 
 #if 0
@@ -984,24 +963,23 @@ if ( 1 == ielem ) {
 
 #endif
 
-    if ( ! residual.extent(0) ) {
-      for( unsigned i = 0; i < FunctionCount ; i++){
-        elem_residuals(ielem, i) = elem_vec[i] ;
-        for( unsigned j = 0; j < FunctionCount ; j++){
-          elem_jacobians(ielem, i, j) = elem_mat[i][j] ;
+    if (!residual.extent(0)) {
+      for (unsigned i = 0; i < FunctionCount; i++) {
+        elem_residuals(ielem, i) = elem_vec[i];
+        for (unsigned j = 0; j < FunctionCount; j++) {
+          elem_jacobians(ielem, i, j) = elem_mat[i][j];
         }
       }
-    }
-    else {
-      for( unsigned i = 0 ; i < FunctionCount ; i++ ) {
-        const unsigned row = node_index[i] ;
-        if ( row < residual.extent(0) ) {
-          atomic_fetch_add( & residual( row ) , elem_vec[i] );
-
-          for( unsigned j = 0 ; j < FunctionCount ; j++ ) {
-            const unsigned entry = elem_graph( ielem , i , j );
-            if ( entry != ~0u ) {
-              atomic_fetch_add( & jacobian.values( entry ) , elem_mat[i][j] );
+    } else {
+      for (unsigned i = 0; i < FunctionCount; i++) {
+        const unsigned row = node_index[i];
+        if (row < residual.extent(0)) {
+          atomic_fetch_add(&residual(row), elem_vec[i]);
+
+          for (unsigned j = 0; j < FunctionCount; j++) {
+            const unsigned entry = elem_graph(ielem, i, j);
+            if (entry != ~0u) {
+              atomic_fetch_add(&jacobian.values(entry), elem_mat[i][j]);
             }
           }
         }
@@ -1012,119 +990,114 @@ if ( 1 == ielem ) {
 
 //----------------------------------------------------------------------------
 
-template< class FixtureType , class SparseMatrixType >
-class DirichletComputation ;
+template <class FixtureType, class SparseMatrixType>
+class DirichletComputation;
 
-template< class DeviceType , BoxElemPart::ElemOrder Order , class CoordinateMap ,
-          typename ScalarType , typename OrdinalType , class MemoryTraits , typename SizeType >
+template <class DeviceType, BoxElemPart::ElemOrder Order, class CoordinateMap,
+          typename ScalarType, typename OrdinalType, class MemoryTraits,
+          typename SizeType>
 class DirichletComputation<
-  Kokkos::Example::BoxElemFixture< DeviceType , Order , CoordinateMap > ,
-  KokkosSparse::CrsMatrix< ScalarType , OrdinalType , DeviceType , MemoryTraits , SizeType > >
-{
-public:
-
-  typedef Kokkos::Example::BoxElemFixture< DeviceType, Order, CoordinateMap >  mesh_type ;
-  typedef typename mesh_type::node_coord_type                                  node_coord_type ;
-  typedef typename node_coord_type::value_type                                 scalar_coord_type ;
-
-  typedef KokkosSparse::CrsMatrix< ScalarType , OrdinalType , DeviceType , MemoryTraits , SizeType >  sparse_matrix_type ;
-  typedef typename sparse_matrix_type::StaticCrsGraphType                                       sparse_graph_type ;
-
-  typedef DeviceType   execution_space ;
-  typedef ScalarType   scalar_type ;
+    Kokkos::Example::BoxElemFixture<DeviceType, Order, CoordinateMap>,
+    KokkosSparse::CrsMatrix<ScalarType, OrdinalType, DeviceType, MemoryTraits,
+                            SizeType> > {
+ public:
+  typedef Kokkos::Example::BoxElemFixture<DeviceType, Order, CoordinateMap>
+      mesh_type;
+  typedef typename mesh_type::node_coord_type node_coord_type;
+  typedef typename node_coord_type::value_type scalar_coord_type;
+
+  typedef KokkosSparse::CrsMatrix<ScalarType, OrdinalType, DeviceType,
+                                  MemoryTraits, SizeType>
+      sparse_matrix_type;
+  typedef typename sparse_matrix_type::StaticCrsGraphType sparse_graph_type;
+
+  typedef DeviceType execution_space;
+  typedef ScalarType scalar_type;
 
   //------------------------------------
 
-  typedef Kokkos::View< scalar_type* , execution_space > vector_type ;
+  typedef Kokkos::View<scalar_type*, execution_space> vector_type;
 
   //------------------------------------
   // Computational data:
 
-  const node_coord_type     node_coords ;
-  const vector_type         solution ;
-  const sparse_matrix_type  jacobian ;
-  const vector_type         residual ;
-  const scalar_type         bc_lower_value ;
-  const scalar_type         bc_upper_value ;
-  const scalar_coord_type   bc_lower_limit ;
-  const scalar_coord_type   bc_upper_limit ;
-  const unsigned            bc_plane ;
-  const unsigned            node_count ;
-        bool                init ;
-
-
-  DirichletComputation( const mesh_type          & arg_mesh ,
-                        const vector_type        & arg_solution ,
-                        const sparse_matrix_type & arg_jacobian ,
-                        const vector_type        & arg_residual ,
-                        const unsigned             arg_bc_plane ,
-                        const scalar_type          arg_bc_lower_value ,
-                        const scalar_type          arg_bc_upper_value )
-    : node_coords( arg_mesh.node_coord() )
-    , solution(    arg_solution )
-    , jacobian(    arg_jacobian )
-    , residual(    arg_residual )
-    , bc_lower_value( arg_bc_lower_value )
-    , bc_upper_value( arg_bc_upper_value )
-    , bc_lower_limit( std::numeric_limits<scalar_coord_type>::epsilon() )
-    , bc_upper_limit( scalar_coord_type(1) - std::numeric_limits<scalar_coord_type>::epsilon() )
-    , bc_plane(       arg_bc_plane )
-    , node_count( arg_mesh.node_count_owned() )
-    , init( false )
-    {
-      parallel_for( node_count , *this );
-      init = true ;
-    }
-
-  void apply() const
-  {
-    parallel_for( node_count , *this );
+  const node_coord_type node_coords;
+  const vector_type solution;
+  const sparse_matrix_type jacobian;
+  const vector_type residual;
+  const scalar_type bc_lower_value;
+  const scalar_type bc_upper_value;
+  const scalar_coord_type bc_lower_limit;
+  const scalar_coord_type bc_upper_limit;
+  const unsigned bc_plane;
+  const unsigned node_count;
+  bool init;
+
+  DirichletComputation(const mesh_type& arg_mesh,
+                       const vector_type& arg_solution,
+                       const sparse_matrix_type& arg_jacobian,
+                       const vector_type& arg_residual,
+                       const unsigned arg_bc_plane,
+                       const scalar_type arg_bc_lower_value,
+                       const scalar_type arg_bc_upper_value)
+      : node_coords(arg_mesh.node_coord()),
+        solution(arg_solution),
+        jacobian(arg_jacobian),
+        residual(arg_residual),
+        bc_lower_value(arg_bc_lower_value),
+        bc_upper_value(arg_bc_upper_value),
+        bc_lower_limit(std::numeric_limits<scalar_coord_type>::epsilon()),
+        bc_upper_limit(scalar_coord_type(1) -
+                       std::numeric_limits<scalar_coord_type>::epsilon()),
+        bc_plane(arg_bc_plane),
+        node_count(arg_mesh.node_count_owned()),
+        init(false) {
+    parallel_for(node_count, *this);
+    init = true;
   }
 
+  void apply() const { parallel_for(node_count, *this); }
+
   //------------------------------------
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const unsigned inode ) const
-  {
+  void operator()(const unsigned inode) const {
     //  Apply dirichlet boundary condition on the Solution and Residual vectors.
     //  To maintain the symmetry of the original global stiffness matrix,
     //  zero out the columns that correspond to boundary conditions, and
     //  update the residual vector accordingly
 
     const unsigned iBeg = jacobian.graph.row_map[inode];
-    const unsigned iEnd = jacobian.graph.row_map[inode+1];
+    const unsigned iEnd = jacobian.graph.row_map[inode + 1];
 
-    const scalar_coord_type c = node_coords(inode,bc_plane);
-    const bool bc_lower = c <= bc_lower_limit ;
-    const bool bc_upper = bc_upper_limit <= c ;
+    const scalar_coord_type c = node_coords(inode, bc_plane);
+    const bool bc_lower       = c <= bc_lower_limit;
+    const bool bc_upper       = bc_upper_limit <= c;
 
-    if ( ! init ) {
-      solution(inode) = bc_lower ? bc_lower_value : (
-                        bc_upper ? bc_upper_value : 0 );
-    }
-    else {
-      if ( bc_lower || bc_upper ) {
-
-        residual(inode) = 0 ;
+    if (!init) {
+      solution(inode) =
+          bc_lower ? bc_lower_value : (bc_upper ? bc_upper_value : 0);
+    } else {
+      if (bc_lower || bc_upper) {
+        residual(inode) = 0;
 
         //  zero each value on the row, and leave a one
         //  on the diagonal
 
-        for( unsigned i = iBeg ; i < iEnd ; ++i ) {
-          jacobian.values(i) = int(inode) == int(jacobian.graph.entries(i)) ? 1 : 0 ;
+        for (unsigned i = iBeg; i < iEnd; ++i) {
+          jacobian.values(i) =
+              int(inode) == int(jacobian.graph.entries(i)) ? 1 : 0;
         }
-      }
-      else {
-
+      } else {
         //  Find any columns that are boundary conditions.
         //  Clear them and adjust the residual vector
 
-        for( unsigned i = iBeg ; i < iEnd ; ++i ) {
-          const unsigned       cnode = jacobian.graph.entries(i) ;
-          const scalar_coord_type cc = node_coords(cnode,bc_plane);
+        for (unsigned i = iBeg; i < iEnd; ++i) {
+          const unsigned cnode       = jacobian.graph.entries(i);
+          const scalar_coord_type cc = node_coords(cnode, bc_plane);
 
-          if ( ( cc <= bc_lower_limit ) || ( bc_upper_limit <= cc ) ) {
-            jacobian.values(i) = 0 ;
+          if ((cc <= bc_lower_limit) || (bc_upper_limit <= cc)) {
+            jacobian.values(i) = 0;
           }
         }
       }
@@ -1139,11 +1112,10 @@ class DirichletComputation<
 //----------------------------------------------------------------------------
 
 /* A Cuda-specific specialization for the element computation functor. */
-#if defined( __CUDACC__ )
+#if defined(__CUDACC__)
 // #include <NonlinearElement_Cuda.hpp>
 #endif
 
 //----------------------------------------------------------------------------
 
 #endif /* #ifndef KOKKOS_EXAMPLE_FENLFUNCTORS_HPP */
-
diff --git a/example/gmres/ex_real_A.cpp b/example/gmres/ex_real_A.cpp
index 2c119d2a9c..b3e95605f7 100644
--- a/example/gmres/ex_real_A.cpp
+++ b/example/gmres/ex_real_A.cpp
@@ -42,31 +42,31 @@
 //@HEADER
 */
 
-#include<math.h>
-#include"KokkosKernels_IOUtils.hpp"
-#include<Kokkos_Core.hpp>
-#include<Kokkos_Random.hpp>
-#include<KokkosBlas.hpp>
-#include<KokkosBlas3_trsm.hpp>
-#include<KokkosSparse_spmv.hpp>
+#include <math.h>
+#include "KokkosSparse_IOUtils.hpp"
+#include <Kokkos_Core.hpp>
+#include <Kokkos_Random.hpp>
+#include <KokkosBlas.hpp>
+#include <KokkosBlas3_trsm.hpp>
+#include <KokkosSparse_spmv.hpp>
 
-#include"gmres.hpp"
+#include "gmres.hpp"
 
-int main(int argc, char *argv[]) {
+int main(int argc, char* argv[]) {
   typedef double ST;
   typedef int OT;
-  typedef Kokkos::DefaultExecutionSpace     EXSP;
+  typedef Kokkos::DefaultExecutionSpace EXSP;
 
-  using ViewVectorType = Kokkos::View<ST*,Kokkos::LayoutLeft, EXSP>;
+  using ViewVectorType = Kokkos::View<ST*, Kokkos::LayoutLeft, EXSP>;
 
-  std::string filename("bcsstk09.mtx"); // example matrix
-  std::string ortho("CGS2"); //orthog type
-  int m = 50; //Max subspace size before restarting.
-  double convTol = 1e-10; //Relative residual convergence tolerance.
-  int cycLim = 50; //Maximum number of times to restart the solver. 
-  bool rand_rhs = false; //Generate random right-hand side. 
+  std::string filename("bcsstk09.mtx");  // example matrix
+  std::string ortho("CGS2");             // orthog type
+  int m          = 50;                   // Max subspace size before restarting.
+  double convTol = 1e-10;  // Relative residual convergence tolerance.
+  int cycLim     = 50;     // Maximum number of times to restart the solver.
+  bool rand_rhs  = false;  // Generate random right-hand side.
 
-  for (int i=1;i<argc;++i) {
+  for (int i = 1; i < argc; ++i) {
     const std::string& token = argv[i];
     if (token == std::string("--filename")) filename = argv[++i];
     if (token == std::string("--max-subsp")) m = std::atoi(argv[++i]);
@@ -74,68 +74,84 @@ int main(int argc, char *argv[]) {
     if (token == std::string("--tol")) convTol = std::stod(argv[++i]);
     if (token == std::string("--ortho")) ortho = argv[++i];
     if (token == std::string("--rand_rhs")) rand_rhs = true;
-    if (token == std::string("--help") || token == std::string("-h")){
-      std::cout << "Kokkos GMRES solver options:" << std::endl
-        << "--filename    :  The name of a matrix market (.mtx) file for matrix A (Default bcsstk09.mtx)." << std::endl
-        << "--max-subsp   :  The maximum size of the Kyrlov subspace before restarting (Default 50)." << std::endl
-        << "--max-restarts:  Maximum number of GMRES restarts (Default 50)." << std::endl
-        << "--tol         :  Convergence tolerance.  (Default 1e-10)." << std::endl
-        << "--ortho       :  Type of orthogonalization. Use 'CGS2' or 'MGS'. (Default 'CGS2')" << std::endl
-        << "--rand_rhs    :  Generate a random right-hand side b.  (Else, default uses b = vector of ones.)" << std::endl
-        << "--help  -h    :  Display this help message." << std::endl 
-        << "Example Call  :  ./Gmres.exe --filename Laplace3D100.mtx --tol 1e-5 --max-subsp 100 " << std::endl << std::endl;
-      return 0; }
+    if (token == std::string("--help") || token == std::string("-h")) {
+      std::cout
+          << "Kokkos GMRES solver options:" << std::endl
+          << "--filename    :  The name of a matrix market (.mtx) file for "
+             "matrix A (Default bcsstk09.mtx)."
+          << std::endl
+          << "--max-subsp   :  The maximum size of the Kyrlov subspace before "
+             "restarting (Default 50)."
+          << std::endl
+          << "--max-restarts:  Maximum number of GMRES restarts (Default 50)."
+          << std::endl
+          << "--tol         :  Convergence tolerance.  (Default 1e-10)."
+          << std::endl
+          << "--ortho       :  Type of orthogonalization. Use 'CGS2' or 'MGS'. "
+             "(Default 'CGS2')"
+          << std::endl
+          << "--rand_rhs    :  Generate a random right-hand side b.  (Else, "
+             "default uses b = vector of ones.)"
+          << std::endl
+          << "--help  -h    :  Display this help message." << std::endl
+          << "Example Call  :  ./Gmres.exe --filename Laplace3D100.mtx --tol "
+             "1e-5 --max-subsp 100 "
+          << std::endl
+          << std::endl;
+      return 0;
+    }
   }
   std::cout << "File to process is: " << filename << std::endl;
   std::cout << "Convergence tolerance is: " << convTol << std::endl;
 
   // Set GMRES options:
   GmresOpts<ST> solverOpts;
-  solverOpts.tol = convTol;
-  solverOpts.m = m;
+  solverOpts.tol        = convTol;
+  solverOpts.m          = m;
   solverOpts.maxRestart = cycLim;
-  solverOpts.ortho = ortho;
+  solverOpts.ortho      = ortho;
+  solverOpts.verbose    = false;  // No verbosity needed for most testing
 
-  //Initialize Kokkos AFTER parsing parameters:
+  // Initialize Kokkos AFTER parsing parameters:
   Kokkos::initialize();
   {
-
-  // Read in a matrix Market file and use it to test the Kokkos Operator.
-  KokkosSparse::CrsMatrix<ST, OT, EXSP> A = 
-    KokkosKernels::Impl::read_kokkos_crst_matrix<KokkosSparse::CrsMatrix<ST, OT, EXSP>>(filename.c_str()); 
-
-  int n = A.numRows();
-  ViewVectorType X("X",n); //Solution and initial guess
-  ViewVectorType Wj("Wj",n); //For checking residuals at end.
-  ViewVectorType B(Kokkos::view_alloc(Kokkos::WithoutInitializing, "B"),n);//right-hand side vec
-
-  if(rand_rhs){
-    // Make rhs random.
-    int rand_seed = 123;
-    Kokkos::Random_XorShift64_Pool<> pool(rand_seed); 
-    Kokkos::fill_random(B, pool, -1,1);
-  }
-  else{
-    // Make rhs ones so that results are repeatable:
-    Kokkos::deep_copy(B,1.0);
-  }
-
-  // Run GMRS solve:
-  GmresStats solveStats = gmres<ST, Kokkos::LayoutLeft, EXSP>(A, B, X, solverOpts);
-
-  // Double check residuals at end of solve:
-  ST nrmB = KokkosBlas::nrm2(B);
-  KokkosSparse::spmv("N", 1.0, A, X, 0.0, Wj); // wj = Ax
-  KokkosBlas::axpy(-1.0, Wj, B); // b = b-Ax. 
-  ST endRes = KokkosBlas::nrm2(B)/nrmB;
-  std::cout << "=========================================" << std::endl;
-  std::cout << "Verify from main: Ending residual is " << endRes << std::endl;
-  std::cout << "Number of iterations is: " << solveStats.numIters << std::endl;
-  std::cout << "Diff of residual from main - residual from solver: " << solveStats.endRelRes - endRes << std::endl;
-  std::cout << "Convergence flag is : " << solveStats.convFlag() << std::endl;
-
+    // Read in a matrix Market file and use it to test the Kokkos Operator.
+    KokkosSparse::CrsMatrix<ST, OT, EXSP> A =
+        KokkosSparse::Impl::read_kokkos_crst_matrix<
+            KokkosSparse::CrsMatrix<ST, OT, EXSP>>(filename.c_str());
+
+    int n = A.numRows();
+    ViewVectorType X("X", n);    // Solution and initial guess
+    ViewVectorType Wj("Wj", n);  // For checking residuals at end.
+    ViewVectorType B(Kokkos::view_alloc(Kokkos::WithoutInitializing, "B"),
+                     n);  // right-hand side vec
+
+    if (rand_rhs) {
+      // Make rhs random.
+      int rand_seed = 123;
+      Kokkos::Random_XorShift64_Pool<> pool(rand_seed);
+      Kokkos::fill_random(B, pool, -1, 1);
+    } else {
+      // Make rhs ones so that results are repeatable:
+      Kokkos::deep_copy(B, 1.0);
+    }
+
+    // Run GMRS solve:
+    GmresStats solveStats =
+        gmres<ST, Kokkos::LayoutLeft, EXSP>(A, B, X, solverOpts);
+
+    // Double check residuals at end of solve:
+    ST nrmB = KokkosBlas::nrm2(B);
+    KokkosSparse::spmv("N", 1.0, A, X, 0.0, Wj);  // wj = Ax
+    KokkosBlas::axpy(-1.0, Wj, B);                // b = b-Ax.
+    ST endRes = KokkosBlas::nrm2(B) / nrmB;
+    std::cout << "=========================================" << std::endl;
+    std::cout << "Verify from main: Ending residual is " << endRes << std::endl;
+    std::cout << "Number of iterations is: " << solveStats.numIters
+              << std::endl;
+    std::cout << "Diff of residual from main - residual from solver: "
+              << solveStats.endRelRes - endRes << std::endl;
+    std::cout << "Convergence flag is : " << solveStats.convFlag() << std::endl;
   }
   Kokkos::finalize();
-
 }
-
diff --git a/example/gmres/gmres.hpp b/example/gmres/gmres.hpp
index 48a6e4ae0d..22b23cde7a 100644
--- a/example/gmres/gmres.hpp
+++ b/example/gmres/gmres.hpp
@@ -117,10 +117,12 @@ struct GmresOpts {
   typename Kokkos::Details::ArithTraits<ScalarType>::mag_type tol;
   int m;
   int maxRestart;
+  bool verbose;
   std::string ortho;
   std::string precSide;
 
-  GmresOpts<ScalarType>() : tol(1e-8), m(50), maxRestart(50), ortho("CGS2") {}
+  GmresOpts<ScalarType>()
+      : tol(1e-8), m(50), maxRestart(50), verbose(true), ortho("CGS2") {}
 };
 
 template <class ScalarType, class Layout, class EXSP, class OrdinalType = int>
@@ -182,7 +184,9 @@ GmresStats gmres(
   MT nrmB, trueRes, relRes, shortRelRes;
   GmresStats myStats;
 
-  std::cout << "Convergence tolerance is: " << opts.tol << std::endl;
+  if (opts.verbose) {
+    std::cout << "Convergence tolerance is: " << opts.tol << std::endl;
+  }
 
   ViewVectorType Xiter(
       "Xiter", n);  // Intermediate solution at iterations before restart.
@@ -229,7 +233,9 @@ GmresStats gmres(
     relRes = 0;
   }
   shortRelRes = relRes;
-  std::cout << "Initial relative residual is: " << relRes << std::endl;
+  if (opts.verbose) {
+    std::cout << "Initial relative residual is: " << relRes << std::endl;
+  }
   if (relRes < opts.tol) {
     converged = true;
   }
@@ -311,8 +317,10 @@ GmresStats gmres(
       GVec_h(j)     = GVec_h(j) * CosVal_h(j);
       shortRelRes   = fabs(GVec_h(j + 1)) / nrmB;
 
-      std::cout << "Shortcut relative residual for iteration "
-                << j + (cycle * m) << " is: " << shortRelRes << std::endl;
+      if (opts.verbose) {
+        std::cout << "Shortcut relative residual for iteration "
+                  << j + (cycle * m) << " is: " << shortRelRes << std::endl;
+      }
       if (tmpNrm <= 1e-14 && shortRelRes >= opts.tol) {
         throw std::runtime_error(
             "GMRES has experienced lucky breakdown, but the residual has not converged.\n\
@@ -359,8 +367,10 @@ GmresStats gmres(
         KokkosBlas::axpy(-one, Wj, Res);                   // r = b-Ax.
         trueRes = KokkosBlas::nrm2(Res);
         relRes  = trueRes / nrmB;
-        std::cout << "True relative residual for iteration " << j + (cycle * m)
-                  << " is : " << relRes << std::endl;
+        if (opts.verbose) {
+          std::cout << "True relative residual for iteration "
+                    << j + (cycle * m) << " is : " << relRes << std::endl;
+        }
         numIters = j + 1;
 
         if (relRes < opts.tol) {
@@ -390,15 +400,21 @@ GmresStats gmres(
   std::cout << "Ending relative residual is: " << relRes << std::endl;
   myStats.endRelRes = static_cast<double>(relRes);
   if (converged) {
-    std::cout << "Solver converged! " << std::endl;
+    if (opts.verbose) {
+      std::cout << "Solver converged! " << std::endl;
+    }
     myStats.convFlagVal = GmresStats::FLAG::Conv;
   } else if (shortRelRes < opts.tol) {
-    std::cout << "Shortcut residual converged, but solver experienced a loss "
-                 "of accuracy."
-              << std::endl;
+    if (opts.verbose) {
+      std::cout << "Shortcut residual converged, but solver experienced a loss "
+                   "of accuracy."
+                << std::endl;
+    }
     myStats.convFlagVal = GmresStats::FLAG::LOA;
   } else {
-    std::cout << "Solver did not converge. :( " << std::endl;
+    if (opts.verbose) {
+      std::cout << "Solver did not converge. :( " << std::endl;
+    }
     myStats.convFlagVal = GmresStats::FLAG::NoConv;
   }
   if (cycle > 0) {
@@ -406,8 +422,10 @@ GmresStats gmres(
   } else {
     myStats.numIters = 0;
   }
-  std::cout << "The solver completed " << myStats.numIters << " iterations."
-            << std::endl;
+  if (opts.verbose) {
+    std::cout << "The solver completed " << myStats.numIters << " iterations."
+              << std::endl;
+  }
 
   Kokkos::Profiling::popRegion();
   return myStats;
diff --git a/example/gmres/test_cmplx_A.cpp b/example/gmres/test_cmplx_A.cpp
index a19d6ad7e1..ad8d19fb03 100644
--- a/example/gmres/test_cmplx_A.cpp
+++ b/example/gmres/test_cmplx_A.cpp
@@ -44,6 +44,7 @@
 
 #include <math.h>
 #include "KokkosKernels_IOUtils.hpp"
+#include "KokkosSparse_IOUtils.hpp"
 #include <Kokkos_Core.hpp>
 #include <Kokkos_Random.hpp>
 #include <KokkosBlas.hpp>
@@ -65,6 +66,7 @@ int main(int /*argc*/, char** /*argv[]*/) {
   solverOpts.tol        = 1e-05;  // Relative residual convergence tolerance.
   solverOpts.maxRestart = 60;
   solverOpts.ortho      = "CGS2";  // orthog type
+  solverOpts.verbose    = false;   // No verbosity needed for most testing
   bool pass1            = false;
   bool pass2            = false;
 
@@ -76,7 +78,7 @@ int main(int /*argc*/, char** /*argv[]*/) {
   {
     // Read in a matrix Market file and use it to test the Kokkos Operator.
     KokkosSparse::CrsMatrix<ST, OT, EXSP> A =
-        KokkosKernels::Impl::read_kokkos_crst_matrix<
+        KokkosSparse::Impl::read_kokkos_crst_matrix<
             KokkosSparse::CrsMatrix<ST, OT, EXSP>>(filename.c_str());
 
     int n = A.numRows();
diff --git a/example/gmres/test_prec.cpp b/example/gmres/test_prec.cpp
index 852a735aa6..11122edccd 100644
--- a/example/gmres/test_prec.cpp
+++ b/example/gmres/test_prec.cpp
@@ -42,30 +42,30 @@
 //@HEADER
 */
 
-#include<KokkosSparse_MatrixPrec.hpp>
-#include<Kokkos_Core.hpp>
-#include<gmres.hpp>
-#include<Kokkos_Random.hpp>
-#include<KokkosBlas.hpp>
-#include<KokkosSparse_spmv.hpp>
+#include <KokkosSparse_MatrixPrec.hpp>
+#include <Kokkos_Core.hpp>
+#include <gmres.hpp>
+#include <Kokkos_Random.hpp>
+#include <KokkosBlas.hpp>
+#include <KokkosSparse_spmv.hpp>
+#include "KokkosSparse_IOUtils.hpp"
 
-int main(int argc, char *argv[]) {
+int main(int argc, char* argv[]) {
+  typedef double ST;
+  typedef int OT;
+  typedef Kokkos::DefaultExecutionSpace EXSP;
 
-  typedef double                            ST;
-  typedef int                               OT;
-  typedef Kokkos::DefaultExecutionSpace     EXSP;
+  using ViewVectorType = Kokkos::View<ST*, Kokkos::LayoutLeft, EXSP>;
 
-  using ViewVectorType = Kokkos::View<ST*,Kokkos::LayoutLeft, EXSP>;
+  std::string ortho("CGS2");  // orthog type
+  int n          = 1000;      // Matrix size
+  int m          = 50;        // Max subspace size before restarting.
+  double convTol = 1e-10;     // Relative residual convergence tolerance.
+  int cycLim     = 50;        // Maximum number of times to restart the solver.
+  bool rand_rhs  = false;     // Generate random right-hand side.
+  bool pass      = false;
 
-  std::string ortho("CGS2"); //orthog type
-  int n = 1000; //Matrix size
-  int m = 50; //Max subspace size before restarting.
-  double convTol = 1e-10; //Relative residual convergence tolerance.
-  int cycLim = 50; //Maximum number of times to restart the solver. 
-  bool rand_rhs = false; //Generate random right-hand side. 
-  bool pass = false;
-
-  for (int i=1;i<argc;++i) {
+  for (int i = 1; i < argc; ++i) {
     const std::string& token = argv[i];
     if (token == std::string("--mat-size")) n = std::atoi(argv[++i]);
     if (token == std::string("--max-subsp")) m = std::atoi(argv[++i]);
@@ -74,75 +74,98 @@ int main(int argc, char *argv[]) {
     if (token == std::string("--ortho")) ortho = argv[++i];
     if (token == std::string("--rand_rhs")) rand_rhs = true;
     if (token == std::string("--help") || token == std::string("-h")) {
-      std::cout << "Kokkos GMRES solver options:" << std::endl
-        << "--mat-size    :  The size of the nxn test matrix. (Default: n=1000.)" << std::endl
-        << "--max-subsp   :  The maximum size of the Kyrlov subspace before restarting (Default 50)." << std::endl
-        << "--max-restarts:  Maximum number of GMRES restarts (Default 50)." << std::endl
-        << "--tol         :  Convergence tolerance.  (Default 1e-10)." << std::endl
-        << "--ortho       :  Type of orthogonalization. Use 'CGS2' or 'MGS'. (Default 'CGS2')" << std::endl
-        << "--rand_rhs    :  Generate a random right-hand side b.  (Else, default uses b = vector of ones.)" << std::endl
-        << "--help  -h    :  Display this help message." << std::endl 
-        << "Example Call  :  ./Gmres.exe --filename Laplace3D100.mtx --tol 1e-5 --max-subsp 100 " << std::endl << std::endl;
-      return 0; }
+      std::cout
+          << "Kokkos GMRES solver options:" << std::endl
+          << "--mat-size    :  The size of the nxn test matrix. (Default: "
+             "n=1000.)"
+          << std::endl
+          << "--max-subsp   :  The maximum size of the Kyrlov subspace before "
+             "restarting (Default 50)."
+          << std::endl
+          << "--max-restarts:  Maximum number of GMRES restarts (Default 50)."
+          << std::endl
+          << "--tol         :  Convergence tolerance.  (Default 1e-10)."
+          << std::endl
+          << "--ortho       :  Type of orthogonalization. Use 'CGS2' or 'MGS'. "
+             "(Default 'CGS2')"
+          << std::endl
+          << "--rand_rhs    :  Generate a random right-hand side b.  (Else, "
+             "default uses b = vector of ones.)"
+          << std::endl
+          << "--help  -h    :  Display this help message." << std::endl
+          << "Example Call  :  ./Gmres.exe --filename Laplace3D100.mtx --tol "
+             "1e-5 --max-subsp 100 "
+          << std::endl
+          << std::endl;
+      return 0;
+    }
   }
   std::cout << "Convergence tolerance is: " << convTol << std::endl;
 
   // Set GMRES options:
   GmresOpts<ST> solverOpts;
-  solverOpts.tol = convTol;
-  solverOpts.m = m;
+  solverOpts.tol        = convTol;
+  solverOpts.m          = m;
   solverOpts.maxRestart = cycLim;
-  solverOpts.ortho = ortho;
+  solverOpts.ortho      = ortho;
+  solverOpts.verbose    = false;  // No verbosity needed for most testing
 
-  //Initialize Kokkos AFTER parsing parameters:
+  // Initialize Kokkos AFTER parsing parameters:
   Kokkos::initialize();
   {
-  // Generate a diagonal matrix with entries 1, 2, ...., 1000 and its inverse.
-  KokkosSparse::CrsMatrix<ST, OT, EXSP> A = 
-                        KokkosKernels::Impl::kk_generate_diag_matrix<KokkosSparse::CrsMatrix<ST, OT, EXSP>>(n);
-  KokkosSparse::Experimental::MatrixPrec<ST, Kokkos::LayoutLeft, EXSP, OT>  * myPrec = 
-                    new KokkosSparse::Experimental::MatrixPrec<ST, Kokkos::LayoutLeft, EXSP, OT>(
-                    KokkosKernels::Impl::kk_generate_diag_matrix<KokkosSparse::CrsMatrix<ST, OT, EXSP>>(n, true));
+    // Generate a diagonal matrix with entries 1, 2, ...., 1000 and its inverse.
+    KokkosSparse::CrsMatrix<ST, OT, EXSP> A =
+        KokkosSparse::Impl::kk_generate_diag_matrix<
+            KokkosSparse::CrsMatrix<ST, OT, EXSP>>(n);
+    KokkosSparse::Experimental::MatrixPrec<ST, Kokkos::LayoutLeft, EXSP, OT>*
+        myPrec =
+            new KokkosSparse::Experimental::MatrixPrec<ST, Kokkos::LayoutLeft,
+                                                       EXSP, OT>(
+                KokkosSparse::Impl::kk_generate_diag_matrix<
+                    KokkosSparse::CrsMatrix<ST, OT, EXSP>>(n, true));
 
-  ViewVectorType X(Kokkos::view_alloc(Kokkos::WithoutInitializing, "X"),n); //Solution and initial guess
-  ViewVectorType Wj("Wj",n); //For checking residuals at end.
-  ViewVectorType B(Kokkos::view_alloc(Kokkos::WithoutInitializing, "B"),n);//right-hand side vec
-  int rand_seed = 123;
-  Kokkos::Random_XorShift64_Pool<> pool(rand_seed); 
-  Kokkos::fill_random(X, pool, -1,1); //Use non-zero initial guess to test GMRES properties. 
-  if(rand_rhs){
-    Kokkos::fill_random(B, pool, -1,1);
-  }
-  else{
-    // Make rhs ones so that results are repeatable:
-    Kokkos::deep_copy(B,1.0);
-  }
+    ViewVectorType X(Kokkos::view_alloc(Kokkos::WithoutInitializing, "X"),
+                     n);         // Solution and initial guess
+    ViewVectorType Wj("Wj", n);  // For checking residuals at end.
+    ViewVectorType B(Kokkos::view_alloc(Kokkos::WithoutInitializing, "B"),
+                     n);  // right-hand side vec
+    int rand_seed = 123;
+    Kokkos::Random_XorShift64_Pool<> pool(rand_seed);
+    Kokkos::fill_random(
+        X, pool, -1,
+        1);  // Use non-zero initial guess to test GMRES properties.
+    if (rand_rhs) {
+      Kokkos::fill_random(B, pool, -1, 1);
+    } else {
+      // Make rhs ones so that results are repeatable:
+      Kokkos::deep_copy(B, 1.0);
+    }
 
-  GmresStats solveStats = gmres<ST, Kokkos::LayoutLeft, EXSP>(A, B, X, solverOpts, myPrec);
-
-  // Double check residuals at end of solve:
-  ST nrmB = KokkosBlas::nrm2(B);
-  KokkosSparse::spmv("N", 1.0, A, X, 0.0, Wj); // wj = Ax
-  KokkosBlas::axpy(-1.0, Wj, B); // b = b-Ax. 
-  ST endRes = KokkosBlas::nrm2(B)/nrmB;
-  std::cout << "=========================================" << std::endl;
-  std::cout << "Verify from main: Ending residual is " << endRes << std::endl;
-  std::cout << "Number of iterations is: " << solveStats.numIters << std::endl;
-  std::cout << "Diff of residual from main - residual from solver: " << solveStats.endRelRes - endRes << std::endl;
-  std::cout << "Convergence flag is : " << solveStats.convFlag() << std::endl;
-  if( endRes < convTol && solveStats.numIters == 1){
-    pass = true;
-  }
+    GmresStats solveStats =
+        gmres<ST, Kokkos::LayoutLeft, EXSP>(A, B, X, solverOpts, myPrec);
 
+    // Double check residuals at end of solve:
+    ST nrmB = KokkosBlas::nrm2(B);
+    KokkosSparse::spmv("N", 1.0, A, X, 0.0, Wj);  // wj = Ax
+    KokkosBlas::axpy(-1.0, Wj, B);                // b = b-Ax.
+    ST endRes = KokkosBlas::nrm2(B) / nrmB;
+    std::cout << "=========================================" << std::endl;
+    std::cout << "Verify from main: Ending residual is " << endRes << std::endl;
+    std::cout << "Number of iterations is: " << solveStats.numIters
+              << std::endl;
+    std::cout << "Diff of residual from main - residual from solver: "
+              << solveStats.endRelRes - endRes << std::endl;
+    std::cout << "Convergence flag is : " << solveStats.convFlag() << std::endl;
+    if (endRes < convTol && solveStats.numIters == 1) {
+      pass = true;
+    }
   }
   Kokkos::finalize();
 
-  if( pass ){
+  if (pass) {
     std::cout << "Test passed!" << std::endl;
-  }
-  else{
+  } else {
     std::cout << "Test Failed!" << std::endl;
   }
-  return ( pass ? EXIT_SUCCESS : EXIT_FAILURE );
+  return (pass ? EXIT_SUCCESS : EXIT_FAILURE);
 }
-
diff --git a/example/gmres/test_real_A.cpp b/example/gmres/test_real_A.cpp
index 3f6edd06a3..abfb3f0101 100644
--- a/example/gmres/test_real_A.cpp
+++ b/example/gmres/test_real_A.cpp
@@ -44,6 +44,7 @@
 
 #include <math.h>
 #include "KokkosKernels_IOUtils.hpp"
+#include "KokkosSparse_IOUtils.hpp"
 #include <Kokkos_Core.hpp>
 #include <Kokkos_Random.hpp>
 #include <KokkosBlas.hpp>
@@ -72,6 +73,7 @@ int main(int /*argc*/, char** /*argv[]*/) {
   solverOpts.m          = 15;      // Max subspace size before restarting.
   solverOpts.tol        = 1e-10;   // Relative residual convergence tolerance.
   solverOpts.maxRestart = 50;
+  solverOpts.verbose    = false;  // No verbosity needed for most testing
   bool pass1            = false;
   bool pass2            = false;
 
@@ -88,7 +90,7 @@ int main(int /*argc*/, char** /*argv[]*/) {
     cOT diagDominance = 1;
     nnz               = 10 * numRows;
     sp_matrix_type A =
-        KokkosKernels::Impl::kk_generate_diagonally_dominant_sparse_matrix<
+        KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix<
             sp_matrix_type>(numRows, numCols, nnz, 0, ncOT(0.01 * numRows),
                             diagDominance);
 
diff --git a/example/graph/KokkosKernels_Example_Distance2GraphColor.cpp b/example/graph/KokkosKernels_Example_Distance2GraphColor.cpp
index 99b398e40c..e921ed06cd 100644
--- a/example/graph/KokkosKernels_Example_Distance2GraphColor.cpp
+++ b/example/graph/KokkosKernels_Example_Distance2GraphColor.cpp
@@ -526,7 +526,9 @@ int main(int argc, char* argv[]) {
       params.use_openmp;  // Assumption is that use_openmp variable is provided
                           // as number of threads
   const int device_id = 0;
-  Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id));
+  Kokkos::initialize(Kokkos::InitializationSettings()
+                         .set_num_threads(num_threads)
+                         .set_device_id(device_id));
 
   // Print out information about the configuration of the run if verbose_level
   // >= 5
diff --git a/example/half/CMakeLists.txt b/example/half/CMakeLists.txt
new file mode 100644
index 0000000000..49553f573f
--- /dev/null
+++ b/example/half/CMakeLists.txt
@@ -0,0 +1,7 @@
+KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+KOKKOSKERNELS_ADD_EXECUTABLE(
+  xpy
+  SOURCES xpy.cpp
+  )
diff --git a/example/half/us-rse-escience-2022-reproducer.sh b/example/half/us-rse-escience-2022-reproducer.sh
new file mode 100755
index 0000000000..8e77f72bc4
--- /dev/null
+++ b/example/half/us-rse-escience-2022-reproducer.sh
@@ -0,0 +1,239 @@
+#!/bin/bash
+################################################################################
+# @Brief: On the specified arch, build and run xpy.
+#
+# Author: Evan Harvey <eharvey@sandia.gov>
+################################################################################
+
+function envprint() {
+  for x in $@; do
+      echo $x:\$$x | envsubst
+  done
+}
+
+function printhelp() {
+  echo "--Usage--"
+  echo "$0 HOST_ARCH <ACCELERATOR_ARCH>"
+  echo "  HOST_ARCH:        POWER9, A64FX, SKX"
+  echo "  ACCELERATOR_ARCH: VOLTA70, AMPERE80"
+  echo ""
+  echo "Invocations used to collect us-rse-escience-2022 results:"
+  echo "env KOKKOS_SRC_DIR=$HOME/KOKKOS.base/kokkos KOKKOSKERNELS_SRC_DIR=$HOME/KOKKOS.base/kokkos-kernels/ KOKKOSKERNELS_SHA=TODO-HEAD-SHA ./us-rse-escience-2022-reproducer.sh POWER9 VOLTA70"
+  echo "env KOKKOS_SRC_DIR=$HOME/KOKKOS.base/kokkos KOKKOSKERNELS_SRC_DIR=$HOME/KOKKOS.base/kokkos-kernels/ KOKKOSKERNELS_SHA=TODO-HEAD-SHA ./us-rse-escience-2022-reproducer.sh AMPERE80"
+}
+
+function earlyexit() {
+   rm -rf $benchmark_dir
+   exit $1
+}
+
+function beval() {
+  local ret=0
+  echo "---------------------------------------------------------------------------------------------------------------"
+  echo "START: \"$@\""
+  if [ $dry_run == "off" ]; then
+    eval $@
+    ret=$PIPESTATUS
+  fi
+  if [ $ret -ne 0 ]; then
+      echo "ERROR: \"$@\""
+      earlyexit 1
+  fi
+  echo "END  : \"$@\""
+  echo "---------------------------------------------------------------------------------------------------------------"
+}
+
+# Handle input args
+export KOKKOS_SRC_DIR=${KOKKOS_SRC_DIR:-"$HOME/KOKKOS.base/kokkos"}
+export KOKKOS_SRC_DIR=$(realpath $KOKKOS_SRC_DIR)
+export KOKKOS_SHA=${KOKKOS_SHA:-"tags/3.6.00"}
+export KOKKOSKERNELS_SRC_DIR=${KOKKOSKERNELS_SRC_DIR:-"$HOME/KOKKOS.base/kokkos-kernels"}
+export KOKKOSKERNELS_SRC_DIR=$(realpath $KOKKOSKERNELS_SRC_DIR)
+envprint KOKKOS_SRC_DIR KOKKOS_SHA KOKKOSKERNELS_SRC_DIR KOKKOSKERNELS_SHA
+
+dry_run="off"
+arch_names="$1 $2"
+echo "HOST_ARCH=\"$1\", ACCELERATOR_ARCH=\"$2\""
+
+# Create benchmark directory
+benchmark_dir=$PWD/$0_$(date +"%Y-%m-%d_%H.%M.%S")
+beval mkdir -p $benchmark_dir/kokkos-{build,install}
+beval mkdir -p $benchmark_dir/kokkos-kernels-{build,install}
+export KOKKOS_BUILD_DIR=$(realpath $benchmark_dir/kokkos-build)
+export KOKKOS_INSTALL_DIR=$(realpath $benchmark_dir/kokkos-install)
+export KOKKOSKERNELS_BUILD_DIR=$(realpath $benchmark_dir/kokkos-kernels-build)
+export KOKKOSKERNELS_INSTALL_DIR=$(realpath $benchmark_dir/kokkos-kernels-install)
+envprint KOKKOS_INSTALL_DIR KOKKOS_BUILD_DIR KOKKOSKERNELS_BUILD_DIR KOKKOSKERNELS_INSTALL_DIR
+
+# Setup arch specific cmake configurations and job submission commands
+if [[ "$arch_names" == " " ]]; then
+    printhelp; earlyexit 1
+elif [ "$arch_names" == "POWER9 VOLTA70" ]; then
+  module purge
+  module load cuda/11.2.0 gcc/8.3.1 cmake/3.18.0
+  kokkos_config_cmd="cd $KOKKOS_BUILD_DIR && $KOKKOS_SRC_DIR/generate_makefile.bash --cxxflags='-O3' \
+                     --arch=Power9,Volta70 --with-cuda=$CUDA_PATH --compiler=$KOKKOS_SRC_DIR/bin/nvcc_wrapper \
+                     --kokkos-path=$KOKKOS_SRC_DIR --prefix=$KOKKOS_INSTALL_DIR 2>&1 | tee kokkos_config_cmd.out"
+  kokkos_config_defaults_cmd="cd $KOKKOS_BUILD_DIR && cmake -DKokkos_ENABLE_TESTS:BOOL=OFF $KOKKOS_SRC_DIR 2>&1 \
+                              | tee -a kokkos_config_cmd.out"
+
+  kokkoskernels_config_cmd="cd $KOKKOSKERNELS_BUILD_DIR && $KOKKOSKERNELS_SRC_DIR/cm_generate_makefile.bash \
+                            --arch=Power9,Volta70 --with-cuda=$CUDA_PATH -- --compiler=$KOKKOS_INSTALL_DIR/bin/nvcc_wrapper \
+                            --cxxflags='-O3' --disable-tests --enable-examples \
+                            --kokkos-path=$KOKKOS_SRC_DIR --kokkoskernels-path=$KOKKOSKERNELS_SRC_DIR \
+                            --kokkos-prefix=$KOKKOS_INSTALL_DIR --prefix=$KOKKOSKERNELS_INSTALL_DIR 2>&1 | \
+                            tee kokkoskernels_config_cmd.out"
+  kokkoskernels_config_defaults_cmd="cd $KOKKOSKERNELS_BUILD_DIR && cmake -DKokkosKernels_INST_LAYOUTLEFT:BOOL=OFF \
+                                   -DKokkosKernels_INST_LAYOUTRIGHT:BOOL=ON -DKokkosKernels_INST_DOUBLE:BOOL=OFF \
+                                   $KOKKOSKERNELS_SRC_DIR 2>&1 | tee -a kokkoskernels_config_cmd.out"
+
+  kokkos_build_cmd="bsub -q normal -W 2:00 -Is $KOKKOS_BUILD_DIR/build.sh"
+  kokkoskernels_build_cmd="bsub -q normal -W 2:00 -Is $KOKKOSKERNELS_BUILD_DIR/build.sh"
+  benchmark_cmd="bsub -q normal -W 2:00 -Is $KOKKOSKERNELS_BUILD_DIR/bench.sh"
+elif [ "$arch_names" == "SNB VOLTA70" ]; then
+  module purge
+  module load sems-archive-env sems-env sems-gcc/8.3.0 sems-cmake/3.19.1 cuda/11.2 sems-archive-git/2.10.1
+  kokkos_config_cmd="cd $KOKKOS_BUILD_DIR && $KOKKOS_SRC_DIR/generate_makefile.bash --cxxflags='-O3' \
+                     --arch=SNB,Volta70 --with-cuda=$CUDA_PATH --compiler=$KOKKOS_SRC_DIR/bin/nvcc_wrapper \
+                     --kokkos-path=$KOKKOS_SRC_DIR --prefix=$KOKKOS_INSTALL_DIR 2>&1 | tee kokkos_config_cmd.out"
+  kokkos_config_defaults_cmd="cd $KOKKOS_BUILD_DIR && cmake -DKokkos_ENABLE_TESTS:BOOL=OFF $KOKKOS_SRC_DIR 2>&1 \
+                              | tee -a kokkos_config_cmd.out"
+
+  kokkoskernels_config_cmd="cd $KOKKOSKERNELS_BUILD_DIR && $KOKKOSKERNELS_SRC_DIR/cm_generate_makefile.bash \
+                            --arch=SNB,Volta70 --with-cuda=$CUDA_PATH --compiler=$KOKKOS_INSTALL_DIR/bin/nvcc_wrapper \
+                            --cxxflags='-O3' --disable-tests --enable-examples \
+                            --kokkos-path=$KOKKOS_SRC_DIR --kokkoskernels-path=$KOKKOSKERNELS_SRC_DIR \
+                            --kokkos-prefix=$KOKKOS_INSTALL_DIR --prefix=$KOKKOSKERNELS_INSTALL_DIR 2>&1 | \
+                            tee kokkoskernels_config_cmd.out"
+  kokkoskernels_config_defaults_cmd="cd $KOKKOSKERNELS_BUILD_DIR && cmake -DKokkosKernels_INST_LAYOUTLEFT:BOOL=OFF \
+                                   -DKokkosKernels_INST_LAYOUTRIGHT:BOOL=ON -DKokkosKernels_INST_DOUBLE:BOOL=OFF \
+                                   $KOKKOSKERNELS_SRC_DIR 2>&1 | tee -a kokkoskernels_config_cmd.out"
+
+  kokkos_build_cmd="$KOKKOS_BUILD_DIR/build.sh"
+  kokkoskernels_build_cmd="$KOKKOSKERNELS_BUILD_DIR/build.sh"
+  benchmark_cmd="$KOKKOSKERNELS_BUILD_DIR/bench.sh"
+elif [ "$arch_names" == "AMPERE80" ]; then
+  module purge
+  module load cudatoolkit/11.2 cmake/3.22.0
+
+  kokkos_config_cmd="cd $KOKKOS_BUILD_DIR && $KOKKOS_SRC_DIR/generate_makefile.bash --cxxflags='-O3' \
+                    --arch=Ampere80 --with-cuda=$CUDA_HOME --compiler=$KOKKOS_SRC_DIR/bin/nvcc_wrapper \
+                    --kokkos-path=$KOKKOS_SRC_DIR --prefix=$KOKKOS_INSTALL_DIR &> kokkos_config_cmd.out"
+
+  kokkos_config_defaults_cmd="cd $KOKKOS_BUILD_DIR && cmake -DKokkos_ENABLE_TESTS:BOOL=OFF $KOKKOS_SRC_DIR &>  kokkos_config_cmd.out"
+  kokkoskernels_config_cmd="cd $KOKKOSKERNELS_BUILD_DIR && $KOKKOSKERNELS_SRC_DIR/cm_generate_makefile.bash \
+                           --arch=Ampere80 --with-cuda=$CUDA_HOME --compiler=$KOKKOS_INSTALL_DIR/bin/nvcc_wrapper \
+                           --cxxflags='-O3' --disable-tests --enable-examples \
+                           --kokkos-path=$KOKKOS_SRC_DIR --kokkoskernels-path=$KOKKOSKERNELS_SRC_DIR \
+                           --kokkos-prefix=$KOKKOS_INSTALL_DIR --prefix=$KOKKOSKERNELS_INSTALL_DIR &> kokkoskernels_config_cmd.out"
+
+  kokkoskernels_config_defaults_cmd="cd $KOKKOSKERNELS_BUILD_DIR && cmake -S $KOKKOSKERNELS_SRC_DIR -DKokkosKernels_INST_LAYOUTLEFT:BOOL=OFF \
+                                  -DKokkosKernels_INST_LAYOUTRIGHT:BOOL=ON -DKokkosKernels_INST_DOUBLE:BOOL=OFF &> kokkoskernels_config_cmd.out"
+
+  kokkos_build_cmd="$KOKKOS_BUILD_DIR/build.sh"
+  kokkoskernels_build_cmd="$KOKKOSKERNELS_BUILD_DIR/build.sh"
+  benchmark_cmd="$KOKKOSKERNELS_BUILD_DIR/bench.sh"
+elif [ "$arch_names" == "A64FX " ]; then
+  export OMP_PROC_BIND=close
+  export OMP_PLACES=cores
+  export OMP_NUM_THREADS=48
+  module purge
+  module load gcc/10.2.0 cmake/3.17.0
+  kokkos_config_cmd="cd $KOKKOS_BUILD_DIR && $KOKKOS_SRC_DIR/generate_makefile.bash --cxxflags='-O3' \
+                     --arch=A64FX \
+                     --kokkos-path=$KOKKOS_SRC_DIR --prefix=$KOKKOS_INSTALL_DIR 2>&1 | tee kokkos_config_cmd.out"
+  kokkos_config_defaults_cmd="cd $KOKKOS_BUILD_DIR && cmake -DKokkos_ENABLE_TESTS:BOOL=OFF $KOKKOS_SRC_DIR 2>&1 \
+                              | tee -a kokkos_config_cmd.out"
+
+  kokkoskernels_config_cmd="cd $KOKKOSKERNELS_BUILD_DIR && $KOKKOSKERNELS_SRC_DIR/cm_generate_makefile.bash \
+                            --cxxflags='-msve-vector-bits=512 -Ofast' --arch=A64FX --with-openmp \
+                            --disable-tests --enable-examples \
+                            --kokkos-path=$KOKKOS_SRC_DIR --kokkoskernels-path=$KOKKOSKERNELS_SRC_DIR \
+                            --kokkos-prefix=$KOKKOS_INSTALL_DIR --prefix=$KOKKOSKERNELS_INSTALL_DIR 2>&1 | \
+                            tee kokkoskernels_config_cmd.out"
+  kokkoskernels_config_defaults_cmd="cd $KOKKOSKERNELS_BUILD_DIR && cmake -DKokkosKernels_INST_LAYOUTLEFT:BOOL=OFF \
+                                   -DKokkosKernels_INST_LAYOUTRIGHT:BOOL=ON -DKokkosKernels_INST_DOUBLE:BOOL=OFF \
+                                   $KOKKOSKERNELS_SRC_DIR 2>&1 | tee -a kokkoskernels_config_cmd.out"
+
+  kokkos_build_cmd="srun --time=2:00:00 -N1 $KOKKOS_BUILD_DIR/build.sh"
+  kokkoskernels_build_cmd="srun --time=2:00:00 -N1 $KOKKOSKERNELS_BUILD_DIR/build.sh"
+  benchmark_cmd="srun --time=2:00:00 -N1 $KOKKOSKERNELS_BUILD_DIR/bench.sh"
+elif [ "$arch_names" == "SKX " ]; then
+    export OMP_PROC_BIND=close
+    export OMP_PLACES=cores
+    export OMP_NUM_THREADS=96
+    module purge
+    module load gcc/7.2.0 cmake/3.19.3
+    kokkos_config_cmd="cd $KOKKOS_BUILD_DIR && $KOKKOS_SRC_DIR/generate_makefile.bash --cxxflags='-O3' \
+                       --arch=SKX \
+                       --kokkos-path=$KOKKOS_SRC_DIR --prefix=$KOKKOS_INSTALL_DIR 2>&1 | tee kokkos_config_cmd.out"
+    kokkos_config_defaults_cmd="cd $KOKKOS_BUILD_DIR && cmake -DKokkos_ENABLE_TESTS:BOOL=OFF $KOKKOS_SRC_DIR 2>&1 \
+                                | tee -a kokkos_config_cmd.out"
+
+    kokkoskernels_config_cmd="cd $KOKKOSKERNELS_BUILD_DIR && $KOKKOSKERNELS_SRC_DIR/cm_generate_makefile.bash \
+                              --cxxflags='-O3' --arch=SKX --with-openmp --disable-tests --enable-examples \
+                              --kokkos-path=$KOKKOS_SRC_DIR --kokkoskernels-path=$KOKKOSKERNELS_SRC_DIR \
+                              --kokkos-prefix=$KOKKOS_INSTALL_DIR --prefix=$KOKKOSKERNELS_INSTALL_DIR 2>&1 | \
+                              tee kokkoskernels_config_cmd.out"
+    kokkoskernels_config_defaults_cmd="cd $KOKKOSKERNELS_BUILD_DIR && cmake -DKokkosKernels_INST_LAYOUTLEFT:BOOL=OFF \
+                                     -DKokkosKernels_INST_LAYOUTRIGHT:BOOL=ON -DKokkosKernels_INST_DOUBLE:BOOL=OFF \
+                                     $KOKKOSKERNELS_SRC_DIR 2>&1 | tee -a kokkoskernels_config_cmd.out"
+
+    kokkos_build_cmd="srun --time=2:00:00 -N1 $KOKKOS_BUILD_DIR/build.sh"
+    kokkoskernels_build_cmd="srun --time=2:00:00 -N1 $KOKKOSKERNELS_BUILD_DIR/build.sh"
+    benchmark_cmd="srun --time=2:00:00 -N1 $KOKKOSKERNELS_BUILD_DIR/bench.sh"
+    use_simd="--use_simd=1"
+else
+  echo "Invalid arch: $arch_names"
+  printhelp; earlyexit 1
+fi
+
+# Write the arch agnostic kokkos build script
+echo "#!/bin/bash" > $KOKKOS_BUILD_DIR/build.sh
+echo "cd $KOKKOS_BUILD_DIR" >> $KOKKOS_BUILD_DIR/build.sh
+echo "make -j40 install" >> $KOKKOS_BUILD_DIR/build.sh
+chmod +x $KOKKOS_BUILD_DIR/build.sh
+
+# Write the arch agnostic kokkos-kernels build script
+echo "#!/bin/bash" > $KOKKOSKERNELS_BUILD_DIR/build.sh
+echo "cd $KOKKOSKERNELS_BUILD_DIR/example/half" >> $KOKKOSKERNELS_BUILD_DIR/build.sh
+echo "make -j40 xpy" >> $KOKKOSKERNELS_BUILD_DIR/build.sh
+chmod +x $KOKKOSKERNELS_BUILD_DIR/build.sh
+
+# Write the arch agnostic kokkos-kernels benchmark script
+echo "#!/bin/bash" > $KOKKOSKERNELS_BUILD_DIR/bench.sh
+echo "cd $benchmark_dir" >> $KOKKOSKERNELS_BUILD_DIR/bench.sh
+echo "$KOKKOSKERNELS_BUILD_DIR/example/half/xpy 10 0 &> xpy_relative_error-10.out" >> $KOKKOSKERNELS_BUILD_DIR/bench.sh
+echo "$KOKKOSKERNELS_BUILD_DIR/example/half/xpy 100 0 &> xpy_relative_error-100.out" >> $KOKKOSKERNELS_BUILD_DIR/bench.sh
+echo "$KOKKOSKERNELS_BUILD_DIR/example/half/xpy 1000 0 &> xpy_relative_error-1000.out" >> $KOKKOSKERNELS_BUILD_DIR/bench.sh
+echo "$KOKKOSKERNELS_BUILD_DIR/example/half/xpy 10000 0 &> xpy_relative_error-10000.out" >> $KOKKOSKERNELS_BUILD_DIR/bench.sh
+echo "$KOKKOSKERNELS_BUILD_DIR/example/half/xpy 100000 0 &> xpy_relative_error-100000.out" >> $KOKKOSKERNELS_BUILD_DIR/bench.sh
+
+echo "$KOKKOSKERNELS_BUILD_DIR/example/half/xpy 50000 1 &> xpy_runtime_only-50000.out" >> $KOKKOSKERNELS_BUILD_DIR/bench.sh
+echo "$KOKKOSKERNELS_BUILD_DIR/example/half/xpy 500000 1 &> xpy_runtime_only-500000.out" >> $KOKKOSKERNELS_BUILD_DIR/bench.sh
+echo "$KOKKOSKERNELS_BUILD_DIR/example/half/xpy 5000000 1 &> xpy_runtime_only-5000000.out" >> $KOKKOSKERNELS_BUILD_DIR/bench.sh
+echo "$KOKKOSKERNELS_BUILD_DIR/example/half/xpy 50000000 1 &> xpy_runtime_only-50000000.out" >> $KOKKOSKERNELS_BUILD_DIR/bench.sh
+echo "$KOKKOSKERNELS_BUILD_DIR/example/half/xpy 500000000 1 &> xpy_runtime_only-500000000.out" >> $KOKKOSKERNELS_BUILD_DIR/bench.sh
+chmod +x $KOKKOSKERNELS_BUILD_DIR/bench.sh
+
+# Check out the correct SHAs
+beval "cd $KOKKOS_SRC_DIR && git checkout $KOKKOS_SHA"
+beval "cd $KOKKOSKERNELS_SRC_DIR && git checkout $KOKKOSKERNELS_SHA"
+
+# Build Kokkos
+beval $kokkos_config_cmd
+beval $kokkos_config_defaults_cmd
+beval $kokkos_build_cmd
+
+# Wait for the file system on the head node to catch up
+while [[ "$arch_names" == "POWER9 VOLTA70" && ! -e $KOKKOS_INSTALL_DIR/bin/nvcc_wrapper ]]; do
+  sleep 3s
+done
+
+# Build KokkosKernels
+beval $kokkoskernels_config_cmd
+beval $kokkoskernels_config_defaults_cmd
+beval $kokkoskernels_build_cmd
+
+# Run the benchmark
+beval $benchmark_cmd
diff --git a/example/half/xpy.cpp b/example/half/xpy.cpp
new file mode 100644
index 0000000000..bc6bf7481d
--- /dev/null
+++ b/example/half/xpy.cpp
@@ -0,0 +1,137 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.4
+//       Copyright (2021) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+
+#include "Kokkos_Core.hpp"
+#include "Kokkos_Random.hpp"
+#include "KokkosKernels_default_types.hpp"
+
+template <class ViewType>
+struct Functor_xpy {
+  ViewType x, y;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int &i) const { x(i) += y(i); }
+};
+
+template <class ScalarType, class DeviceType, class LayoutType>
+void do_xpy(size_t n, bool time_only = false) {
+  using namespace Kokkos;
+  using ExecutionSpace      = typename DeviceType::execution_space;
+  using ViewType            = View<ScalarType *, LayoutType, DeviceType>;
+  using ReferenceScalarType = double;
+
+  ViewType x("x", n);
+  ViewType y("y", n);
+  View<ReferenceScalarType *, LayoutType, DeviceType> x_rand("x_rand", n);
+  View<ReferenceScalarType *, LayoutType, DeviceType> y_rand("y_rand", n);
+
+  View<ReferenceScalarType *, LayoutType, HostSpace> expected("expected", n);
+  View<ReferenceScalarType *, LayoutType, HostSpace> relative_error(
+      "relative_error", n);
+  typename ViewType::HostMirror x_host = create_mirror_view(x);
+  typename ViewType::HostMirror y_host = create_mirror_view(y);
+  // TODO: Report segfault in random_pool creation with:
+  // typename ViewType::HostMirror y_host = create_mirror_view(y_host);
+
+  Random_XorShift64_Pool<ExecutionSpace> random_pool(12345);
+  fill_random(x_rand, random_pool, ReferenceScalarType(1.0),
+              ReferenceScalarType(2.0));
+  fill_random(y_rand, random_pool, ReferenceScalarType(1.0),
+              ReferenceScalarType(2.0));
+  ExecutionSpace().fence();
+
+  deep_copy(x, x_rand);
+  deep_copy(y, y_rand);
+  ExecutionSpace().fence();
+
+  deep_copy(x_host, x);
+  deep_copy(y_host, y);
+  ExecutionSpace().fence();
+
+  Functor_xpy<ViewType> xpy;
+  xpy.x = x;
+  xpy.y = y;
+  Timer timer;
+  parallel_for("xpy", n, xpy);
+  ExecutionSpace().fence();
+  double s = timer.seconds();
+
+  if (!time_only) {
+    for (size_t i = 0; i < n; i++)
+      expected(i) = static_cast<ReferenceScalarType>(y_host(i)) +
+                    static_cast<ReferenceScalarType>(x_host(i));
+  }
+
+  deep_copy(x_host, x);
+  ExecutionSpace().fence();
+
+  std::cout << "n: " << n << ", " << typeid(ScalarType).name()
+            << " Runtime(s): " << s << std::endl;
+
+  if (!time_only) {
+    std::cout << "n: " << n << ", " << typeid(ScalarType).name()
+              << " Relative Errors:" << std::endl;
+    for (size_t i = 0; i < n; i++) {
+      std::cout << ", " << std::abs(expected(i) - x_host(i)) / expected(i)
+                << std::endl;
+    }
+    std::cout << std::endl << std::endl;
+  }
+}
+
+int main(int argc, char **argv) {
+  Kokkos::initialize();
+  if (argc < 2) {
+    std::cout << "./" << argv[0] << " N:Z TIME_ONLY:{0,1}" << std::endl;
+    Kokkos::finalize();
+    return 1;
+  }
+  using LayoutType = Kokkos::LayoutLeft;
+  using DeviceType = default_device;
+  size_t n         = atoi(argv[1]);
+  bool time_only   = static_cast<bool>(atoi(argv[2]));
+  do_xpy<float, DeviceType, LayoutType>(n, time_only);
+  do_xpy<Kokkos::Experimental::half_t, DeviceType, LayoutType>(n, time_only);
+  do_xpy<Kokkos::Experimental::bhalf_t, DeviceType, LayoutType>(n, time_only);
+  Kokkos::finalize();
+  return 0;
+}
\ No newline at end of file
diff --git a/example/hashmap_accumulator/KokkosKernels_Example_HashmapAccumulator.cpp b/example/hashmap_accumulator/KokkosKernels_Example_HashmapAccumulator.cpp
index 9909c55720..aec112b584 100644
--- a/example/hashmap_accumulator/KokkosKernels_Example_HashmapAccumulator.cpp
+++ b/example/hashmap_accumulator/KokkosKernels_Example_HashmapAccumulator.cpp
@@ -384,7 +384,9 @@ int main(int argc, char* argv[]) {
       params.use_openmp;  // Assumption is that use_openmp variable is provided
                           // as number of threads
 
-  Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id));
+  Kokkos::initialize(Kokkos::InitializationSettings()
+                         .set_num_threads(num_threads)
+                         .set_device_id(device_id));
 
   if (params.verbose) {
     Kokkos::print_configuration(std::cout);
diff --git a/example/wiki/sparse/KokkosSparse_wiki_gauss_seidel.cpp b/example/wiki/sparse/KokkosSparse_wiki_gauss_seidel.cpp
index 1fc1fc37d2..ce171c46bd 100644
--- a/example/wiki/sparse/KokkosSparse_wiki_gauss_seidel.cpp
+++ b/example/wiki/sparse/KokkosSparse_wiki_gauss_seidel.cpp
@@ -2,88 +2,96 @@
 #include "KokkosKernels_default_types.hpp"
 #include "KokkosKernels_Handle.hpp"
 #include "KokkosKernels_IOUtils.hpp"
+#include "KokkosSparse_IOUtils.hpp"
 #include "KokkosSparse_spmv.hpp"
 #include "KokkosSparse_CrsMatrix.hpp"
 #include "KokkosSparse_gauss_seidel.hpp"
 #include "KokkosBlas1_nrm2.hpp"
 
-//Parallel Gauss-Seidel Preconditioner/Smoother
+// Parallel Gauss-Seidel Preconditioner/Smoother
 //  -Uses graph coloring to find independent row sets,
 //   and applies GS to each set in parallel
 //  -Here, use to solve a diagonally dominant linear system directly.
 
-//Helper to print out colors in the shape of the grid
-int main()
-{
-  using Scalar  = default_scalar;
-  using Mag     = Kokkos::ArithTraits<Scalar>::mag_type;
-  using Ordinal = default_lno_t;
-  using Offset  = default_size_type;
+// Helper to print out colors in the shape of the grid
+int main() {
+  using Scalar    = default_scalar;
+  using Mag       = Kokkos::ArithTraits<Scalar>::mag_type;
+  using Ordinal   = default_lno_t;
+  using Offset    = default_size_type;
   using ExecSpace = Kokkos::DefaultExecutionSpace;
-  using MemSpace = typename ExecSpace::memory_space;
-  using Device  = Kokkos::Device<ExecSpace, MemSpace>;
-  using Handle  = KokkosKernels::Experimental::
-    KokkosKernelsHandle<Offset, Ordinal, default_scalar, ExecSpace, MemSpace, MemSpace>;
-  using Matrix  = KokkosSparse::CrsMatrix<Scalar, Ordinal, Device, void, Offset>;
-  using Vector  = typename Matrix::values_type;
+  using MemSpace  = typename ExecSpace::memory_space;
+  using Device    = Kokkos::Device<ExecSpace, MemSpace>;
+  using Handle    = KokkosKernels::Experimental::KokkosKernelsHandle<
+      Offset, Ordinal, default_scalar, ExecSpace, MemSpace, MemSpace>;
+  using Matrix = KokkosSparse::CrsMatrix<Scalar, Ordinal, Device, void, Offset>;
+  using Vector = typename Matrix::values_type;
   constexpr Ordinal numRows = 10000;
-  const Scalar one = Kokkos::ArithTraits<Scalar>::one();
-  const Mag magOne = Kokkos::ArithTraits<Mag>::one();
-  //Solve tolerance
+  const Scalar one          = Kokkos::ArithTraits<Scalar>::one();
+  const Mag magOne          = Kokkos::ArithTraits<Mag>::one();
+  // Solve tolerance
   const Mag tolerance = 1e-6 * magOne;
   Kokkos::initialize();
   {
-    //Generate a square, strictly diagonally dominant, but nonsymmetric matrix on which Gauss-Seidel should converge.
-    //Get approx. 20 entries per row
-    //Diagonals are 2x the absolute sum of all other entries.
+    // Generate a square, strictly diagonally dominant, but nonsymmetric matrix
+    // on which Gauss-Seidel should converge. Get approx. 20 entries per row
+    // Diagonals are 2x the absolute sum of all other entries.
     Offset nnz = numRows * 20;
-    Matrix A = KokkosKernels::Impl::kk_generate_diagonally_dominant_sparse_matrix<Matrix>(numRows, numRows, nnz, 2, 100, 1.05 * one);
-    std::cout << "Generated a matrix with " << numRows << " rows/cols, and " << nnz << " entries.\n";
-    //Create a kernel handle, then a Gauss-Seidel handle with the default algorithm
+    Matrix A =
+        KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix<
+            Matrix>(numRows, numRows, nnz, 2, 100, 1.05 * one);
+    std::cout << "Generated a matrix with " << numRows << " rows/cols, and "
+              << nnz << " entries.\n";
+    // Create a kernel handle, then a Gauss-Seidel handle with the default
+    // algorithm
     Handle handle;
     handle.create_gs_handle(KokkosSparse::GS_DEFAULT);
-    //Set up Gauss-Seidel for the graph (matrix sparsity pattern)
-    KokkosSparse::Experimental::gauss_seidel_symbolic(&handle, numRows, numRows, A.graph.row_map, A.graph.entries, false);
-    //Set up Gauss-Seidel for the matrix values (numeric)
-    //Another matrix with the same sparsity pattern could re-use the handle and symbolic phase, and only call numeric.
-    KokkosSparse::Experimental::gauss_seidel_numeric(&handle, numRows, numRows, A.graph.row_map, A.graph.entries, A.values, false);
-    //Now, preconditioner is ready to use. Set up an unknown vector (uninitialized) and randomized right-hand-side vector.
+    // Set up Gauss-Seidel for the graph (matrix sparsity pattern)
+    KokkosSparse::Experimental::gauss_seidel_symbolic(
+        &handle, numRows, numRows, A.graph.row_map, A.graph.entries, false);
+    // Set up Gauss-Seidel for the matrix values (numeric)
+    // Another matrix with the same sparsity pattern could re-use the handle and
+    // symbolic phase, and only call numeric.
+    KokkosSparse::Experimental::gauss_seidel_numeric(
+        &handle, numRows, numRows, A.graph.row_map, A.graph.entries, A.values,
+        false);
+    // Now, preconditioner is ready to use. Set up an unknown vector
+    // (uninitialized) and randomized right-hand-side vector.
     Vector x(Kokkos::view_alloc(Kokkos::WithoutInitializing, "x"), numRows);
     Vector b(Kokkos::view_alloc(Kokkos::WithoutInitializing, "b"), numRows);
     Vector res(Kokkos::view_alloc(Kokkos::WithoutInitializing, "res"), numRows);
     auto bHost = Kokkos::create_mirror_view(b);
-    for(Ordinal i = 0; i < numRows; i++)
+    for (Ordinal i = 0; i < numRows; i++)
       bHost(i) = 3 * ((one * rand()) / RAND_MAX);
     Kokkos::deep_copy(b, bHost);
-    //Measure initial residual norm ||Ax - b||, where x is 0
-    Mag initialRes = KokkosBlas::nrm2(b);
+    // Measure initial residual norm ||Ax - b||, where x is 0
+    Mag initialRes    = KokkosBlas::nrm2(b);
     Mag scaledResNorm = magOne;
-    bool firstIter = true;
-    //Iterate until reaching the tolerance
+    bool firstIter    = true;
+    // Iterate until reaching the tolerance
     int numIters = 0;
-    while(scaledResNorm > tolerance)
-    {
-      //Run one sweep of forward Gauss-Seidel (SOR with omega = 1.0)
-      //If this is the first iteration, tell apply:
+    while (scaledResNorm > tolerance) {
+      // Run one sweep of forward Gauss-Seidel (SOR with omega = 1.0)
+      // If this is the first iteration, tell apply:
       //  * to zero out x (it was uninitialized)
-      //  * that b has changed since the previous apply (since there was no previous apply)
+      //  * that b has changed since the previous apply (since there was no
+      //  previous apply)
       KokkosSparse::Experimental::forward_sweep_gauss_seidel_apply(
-          &handle, numRows, numRows,
-          A.graph.row_map, A.graph.entries, A.values,
+          &handle, numRows, numRows, A.graph.row_map, A.graph.entries, A.values,
           x, b, firstIter, firstIter, one, 1);
       firstIter = false;
-      //Now, compute the new residual norm using SPMV
+      // Now, compute the new residual norm using SPMV
       Kokkos::deep_copy(res, b);
-      //Compute res := Ax - res (since res is now equal to b, this is Ax - b)
+      // Compute res := Ax - res (since res is now equal to b, this is Ax - b)
       KokkosSparse::spmv("N", one, A, x, -one, res);
-      //Recompute the scaled norm
+      // Recompute the scaled norm
       scaledResNorm = KokkosBlas::nrm2(res) / initialRes;
       numIters++;
-      std::cout << "Iteration " << numIters << " scaled residual norm: " << scaledResNorm << '\n';
+      std::cout << "Iteration " << numIters
+                << " scaled residual norm: " << scaledResNorm << '\n';
     }
     std::cout << "SUCCESS: converged in " << numIters << " iterations.\n";
   }
   Kokkos::finalize();
   return 0;
 }
-
diff --git a/master_history.txt b/master_history.txt
index ddf9143c73..91399d7ba0 100644
--- a/master_history.txt
+++ b/master_history.txt
@@ -17,3 +17,4 @@ tag: 3.4.01     date: 05/20/2021  master: 564dccb3    release: 4c62eb86
 tag: 3.5.00     date: 11/19/2021  master: 00189c0b    release: f171533d
 tag: 3.6.00     date: 04/06/2022  master: 8381db04    release: a7e683c4
 tag: 3.6.01     date: 05/23/2022  master: e09389ae    release: e1d8de42
+tag: 3.7.00     date: 08/25/2022  master: 42ab7a29    release: 9cc88ffa
diff --git a/perf_test/batched/CMakeLists.txt b/perf_test/batched/CMakeLists.txt
index 36435ecfc1..d044cf021f 100644
--- a/perf_test/batched/CMakeLists.txt
+++ b/perf_test/batched/CMakeLists.txt
@@ -1,9 +1,9 @@
 KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
 KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
 
-KOKKOSKERNELS_ADD_EXECUTABLE(KokkosBatched_Test_BlockTridiag
-  SOURCES KokkosBatched_Test_BlockTridiagDirect.cpp
-)
-KOKKOSKERNELS_ADD_EXECUTABLE(KokkosBatched_Test_BlockJacobi
-  SOURCES KokkosBatched_Test_BlockJacobi_Tutorial.cpp
-)
+#KOKKOSKERNELS_ADD_EXECUTABLE(KokkosBatched_Test_BlockTridiag
+#  SOURCES KokkosBatched_Test_BlockTridiagDirect.cpp
+#)
+#KOKKOSKERNELS_ADD_EXECUTABLE(KokkosBatched_Test_BlockJacobi
+#  SOURCES KokkosBatched_Test_BlockJacobi_Tutorial.cpp
+#)
diff --git a/perf_test/batched/KokkosBatched_Test_BlockJacobi_Tutorial.cpp b/perf_test/batched/KokkosBatched_Test_BlockJacobi_Tutorial.cpp
index f3237d9b4f..94f58fba83 100644
--- a/perf_test/batched/KokkosBatched_Test_BlockJacobi_Tutorial.cpp
+++ b/perf_test/batched/KokkosBatched_Test_BlockJacobi_Tutorial.cpp
@@ -3,16 +3,6 @@
 #include "Kokkos_Timer.hpp"
 #include "Kokkos_Random.hpp"
 
-#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
-#if !defined(KOKKOS_ENABLE_CUDA) || (8000 <= CUDA_VERSION)
-#if defined(KOKKOS_ENABLE_CUDA_LAMBDA)
-#define KOKKOSBATCHED_TEST_BLOCKJACOBI
-#endif
-#endif
-#endif
-
-#if defined(KOKKOSBATCHED_TEST_BLOCKJACOBI)
-
 /// KokkosKernels headers
 #include "KokkosBatched_Util.hpp"
 
@@ -79,6 +69,152 @@ val_type computeResidual(const ManyMatrixType &A, const ManyVectorType &x,
   return residual;
 }
 
+namespace ConstructBlockJacobi {
+template <class VT>
+struct Task1Factorize {
+ private:
+  VT __A;
+
+ public:
+  Task1Factorize(VT A) : __A(A) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const member_type &member) const {
+    const int i = member.league_rank();
+    auto AA     = Kokkos::subview(__A, i, Kokkos::ALL(), Kokkos::ALL());
+    TeamLU<member_type, Algo::Level3::Unblocked>::invoke(member, AA);
+  }
+};
+
+template <class VT>
+struct Task1SetIdentity {
+ private:
+  VT __A;
+
+ public:
+  Task1SetIdentity(VT A) : __A(A) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const member_type &member) const {
+    const int i = member.league_rank();
+    auto AA     = Kokkos::subview(__A, i, Kokkos::ALL(), Kokkos::ALL());
+    TeamSetIdentity<member_type>::invoke(member, AA);
+  }
+};
+
+template <class VTA, class VTT>
+struct Task1SolveLowerTriangular {
+ private:
+  VTA __A;
+  VTT __T;
+
+ public:
+  Task1SolveLowerTriangular(VTA A, VTT T) : __A(A), __T(T) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const member_type &member) const {
+    const int i = member.league_rank();
+    const val_type one(1);
+    auto AA = Kokkos::subview(__A, i, Kokkos::ALL(), Kokkos::ALL());
+    auto TT = Kokkos::subview(__T, i, Kokkos::ALL(), Kokkos::ALL());
+    TeamTrsm<member_type, Side::Left, Uplo::Lower, Trans::NoTranspose,
+             Diag::Unit, Algo::Level3::Unblocked>::invoke(member, one, TT, AA);
+  }
+};
+
+template <class VTA, class VTT>
+struct Task1SolveUpperTriangular {
+ private:
+  VTA __A;
+  VTT __T;
+
+ public:
+  Task1SolveUpperTriangular(VTA A, VTT T) : __A(A), __T(T) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const member_type &member) const {
+    const int i = member.league_rank();
+    const val_type one(1);
+    auto AA = Kokkos::subview(__A, i, Kokkos::ALL(), Kokkos::ALL());
+    auto TT = Kokkos::subview(__T, i, Kokkos::ALL(), Kokkos::ALL());
+    TeamTrsm<member_type, Side::Left, Uplo::Upper, Trans::NoTranspose,
+             Diag::NonUnit, Algo::Level3::Unblocked>::invoke(member, one, TT,
+                                                             AA);
+  }
+};
+}  // namespace ConstructBlockJacobi
+
+template <class VTA, class VTX, class VTB>
+struct Task1ApplyBlockJacobi {
+ private:
+  VTA __A;
+  VTX __x;
+  VTB __b;
+
+ public:
+  Task1ApplyBlockJacobi(VTA A, VTX x, VTB b) : __A(A), __x(x), __b(b) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const member_type &member) const {
+    const int i = member.league_rank();
+    const val_type one(1), zero(0);
+    auto AA = Kokkos::subview(__A, i, Kokkos::ALL(), Kokkos::ALL());
+    auto xx = Kokkos::subview(__x, i, Kokkos::ALL());
+    auto bb = Kokkos::subview(__b, i, Kokkos::ALL());
+    TeamGemv<member_type, Trans::NoTranspose, Algo::Level2::Unblocked>::invoke(
+        member, one, AA, bb, zero, xx);
+  }
+};
+
+template <class VTA, class VTT>
+struct Task2FactorizeInvert {
+ private:
+  VTA __A;
+  VTT __T;
+
+ public:
+  Task2FactorizeInvert(VTA A, VTT T) : __A(A), __T(T) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const member_type &member) const {
+    const val_type one(1);
+    const int i = member.league_rank();
+    auto AA     = Kokkos::subview(__A, i, Kokkos::ALL(), Kokkos::ALL());
+    auto TT     = Kokkos::subview(__T, i, Kokkos::ALL(), Kokkos::ALL());
+
+    TeamLU<member_type, Algo::Level3::Unblocked>::invoke(member, AA);
+    TeamCopy<member_type, Trans::NoTranspose>::invoke(member, AA, TT);
+    TeamSetIdentity<member_type>::invoke(member, AA);
+    TeamTrsm<member_type, Side::Left, Uplo::Lower, Trans::NoTranspose,
+             Diag::Unit, Algo::Level3::Unblocked>::invoke(member, one, TT, AA);
+    TeamTrsm<member_type, Side::Left, Uplo::Upper, Trans::NoTranspose,
+             Diag::NonUnit, Algo::Level3::Unblocked>::invoke(member, one, TT,
+                                                             AA);
+  }
+};
+
+template <class VTA, class VTX, class VTB>
+struct Task2ApplyBlockJacobi {
+ private:
+  VTA __A;
+  VTX __x;
+  VTB __b;
+
+ public:
+  Task2ApplyBlockJacobi(VTA A, VTX x, VTB b) : __A(A), __x(x), __b(b) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const member_type &member) const {
+    const int i = member.league_rank();
+    const val_type one(1), zero(0);
+    auto AA = Kokkos::subview(__A, i, Kokkos::ALL(), Kokkos::ALL());
+    auto xx = Kokkos::subview(__x, i, Kokkos::ALL());
+    auto bb = Kokkos::subview(__b, i, Kokkos::ALL());
+    TeamGemv<member_type, Trans::NoTranspose, Algo::Level2::Unblocked>::invoke(
+        member, one, AA, bb, zero, xx);
+  }
+};
+
 int main(int argc, char *argv[]) {
   Kokkos::initialize(argc, argv);
   {
@@ -159,44 +295,21 @@ int main(int argc, char *argv[]) {
         timer.reset();
         Kokkos::parallel_for(
             "task1.factorize", policy,
-            KOKKOS_LAMBDA(const member_type &member) {
-              const int i = member.league_rank();
-              auto AA     = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL());
-              TeamLU<member_type, Algo::Level3::Unblocked>::invoke(member, AA);
-            });
+            ConstructBlockJacobi::Task1Factorize<decltype(A)>(A));
         Kokkos::deep_copy(T, A);
         Kokkos::parallel_for(
             "task1.set-identity", policy,
-            KOKKOS_LAMBDA(const member_type &member) {
-              const int i = member.league_rank();
-              auto AA     = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL());
-              TeamSetIdentity<member_type>::invoke(member, AA);
-            });
+            ConstructBlockJacobi::Task1SetIdentity<decltype(A)>(A));
         Kokkos::fence();
         Kokkos::parallel_for(
             "task1.solve-lower-triangular", policy,
-            KOKKOS_LAMBDA(const member_type &member) {
-              const int i = member.league_rank();
-              const val_type one(1);
-              auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL());
-              auto TT = Kokkos::subview(T, i, Kokkos::ALL(), Kokkos::ALL());
-              TeamTrsm<member_type, Side::Left, Uplo::Lower, Trans::NoTranspose,
-                       Diag::Unit, Algo::Level3::Unblocked>::invoke(member, one,
-                                                                    TT, AA);
-            });
+            ConstructBlockJacobi::Task1SolveLowerTriangular<decltype(A),
+                                                            decltype(T)>(A, T));
         Kokkos::fence();
         Kokkos::parallel_for(
             "task1.solve-upper-triangular", policy,
-            KOKKOS_LAMBDA(const member_type &member) {
-              const int i = member.league_rank();
-              const val_type one(1);
-              auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL());
-              auto TT = Kokkos::subview(T, i, Kokkos::ALL(), Kokkos::ALL());
-              TeamTrsm<member_type, Side::Left, Uplo::Upper, Trans::NoTranspose,
-                       Diag::NonUnit, Algo::Level3::Unblocked>::invoke(member,
-                                                                       one, TT,
-                                                                       AA);
-            });
+            ConstructBlockJacobi::Task1SolveUpperTriangular<decltype(A),
+                                                            decltype(T)>(A, T));
         Kokkos::fence();
         const double t = timer.seconds();
         printf(
@@ -211,16 +324,8 @@ int main(int argc, char *argv[]) {
         policy_type policy(A.extent(0), Kokkos::AUTO());
         Kokkos::parallel_for(
             "task1.apply-block-jacobi", policy,
-            KOKKOS_LAMBDA(const member_type &member) {
-              const int i = member.league_rank();
-              const val_type one(1), zero(0);
-              auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL());
-              auto xx = Kokkos::subview(x, i, Kokkos::ALL());
-              auto bb = Kokkos::subview(b, i, Kokkos::ALL());
-              TeamGemv<member_type, Trans::NoTranspose,
-                       Algo::Level2::Unblocked>::invoke(member, one, AA, bb,
-                                                        zero, xx);
-            });
+            Task1ApplyBlockJacobi<decltype(A), decltype(x), decltype(b)>(A, x,
+                                                                         b));
         const double t = timer.seconds();
         printf(
             "task 1: application of jacobi time = %f , # of applications per "
@@ -256,23 +361,7 @@ int main(int argc, char *argv[]) {
         timer.reset();
         Kokkos::parallel_for(
             "task2.factorize-invert", policy,
-            KOKKOS_LAMBDA(const member_type &member) {
-              const val_type one(1);
-              const int i = member.league_rank();
-              auto AA     = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL());
-              auto TT     = Kokkos::subview(T, i, Kokkos::ALL(), Kokkos::ALL());
-
-              TeamLU<member_type, Algo::Level3::Unblocked>::invoke(member, AA);
-              TeamCopy<member_type, Trans::NoTranspose>::invoke(member, AA, TT);
-              TeamSetIdentity<member_type>::invoke(member, AA);
-              TeamTrsm<member_type, Side::Left, Uplo::Lower, Trans::NoTranspose,
-                       Diag::Unit, Algo::Level3::Unblocked>::invoke(member, one,
-                                                                    TT, AA);
-              TeamTrsm<member_type, Side::Left, Uplo::Upper, Trans::NoTranspose,
-                       Diag::NonUnit, Algo::Level3::Unblocked>::invoke(member,
-                                                                       one, TT,
-                                                                       AA);
-            });
+            Task2FactorizeInvert<decltype(A), decltype(T)>(A, T));
         Kokkos::fence();
         const double t = timer.seconds();
         printf(
@@ -287,16 +376,8 @@ int main(int argc, char *argv[]) {
         policy_type policy(A.extent(0), Kokkos::AUTO());
         Kokkos::parallel_for(
             "task2.apply-block-jacobi", policy,
-            KOKKOS_LAMBDA(const member_type &member) {
-              const int i = member.league_rank();
-              const val_type one(1), zero(0);
-              auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL());
-              auto xx = Kokkos::subview(x, i, Kokkos::ALL());
-              auto bb = Kokkos::subview(b, i, Kokkos::ALL());
-              TeamGemv<member_type, Trans::NoTranspose,
-                       Algo::Level2::Unblocked>::invoke(member, one, AA, bb,
-                                                        zero, xx);
-            });
+            Task2ApplyBlockJacobi<decltype(A), decltype(x), decltype(b)>(A, x,
+                                                                         b));
         const double t = timer.seconds();
         printf(
             "task 2: application of jacobi time = %f , # of applications per "
@@ -318,7 +399,3 @@ int main(int argc, char *argv[]) {
 
   return 0;
 }
-
-#else
-int main() { return 0; }
-#endif
diff --git a/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp b/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp
index a8b3de209b..ffa6efec5e 100644
--- a/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp
+++ b/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp
@@ -3,16 +3,6 @@
 #include "Kokkos_Timer.hpp"
 #include "Kokkos_Random.hpp"
 
-#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
-#if !defined(KOKKOS_ENABLE_CUDA) || (8000 <= CUDA_VERSION)
-#if defined(KOKKOS_ENABLE_CUDA_LAMBDA)
-#define KOKKOSBATCHED_TEST_BLOCKTRIDIAGDIRECT
-#endif
-#endif
-#endif
-
-#if defined(KOKKOSBATCHED_TEST_BLOCKTRIDIAGDIRECT)
-
 /// KokkosKernels headers
 #include "KokkosBatched_Util.hpp"
 #include "KokkosBatched_Vector.hpp"
@@ -47,11 +37,13 @@
 
 #define KOKKOSBATCHED_USE_128BIT_MEMORY_INST
 
-typedef Kokkos::DefaultExecutionSpace exec_space;
-typedef typename exec_space::memory_space memory_space;
-typedef Kokkos::DefaultHostExecutionSpace host_space;
+using exec_space_type   = Kokkos::DefaultExecutionSpace;
+using memory_space_type = exec_space_type::memory_space;
+using host_space_type   = Kokkos::DefaultHostExecutionSpace;
 
-typedef double value_type;
+using value_type  = double;
+using policy_type = Kokkos::TeamPolicy<exec_space_type>;
+using member_type = typename policy_type::member_type;
 
 /// 128*128*128/16*5 * (2*8) / 16
 ///
@@ -60,10 +52,10 @@ typedef double value_type;
 using namespace KokkosBatched;
 
 static constexpr int vector_length =
-    DefaultVectorLength<value_type, memory_space>::value;
+    DefaultVectorLength<value_type, memory_space_type>::value;
 #if defined(KOKKOSBATCHED_USE_128BIT_MEMORY_INST)
 static constexpr int internal_vector_length =
-    DefaultInternalVectorLength<value_type, memory_space>::value;
+    DefaultInternalVectorLength<value_type, memory_space_type>::value;
 #else
 static constexpr int internal_vector_length = 1;
 #endif
@@ -75,40 +67,161 @@ typedef Vector<SIMD<value_type>, internal_vector_length> internal_vector_type;
 typedef value_type internal_vector_type;
 #endif
 
-template <typename ActiveMemorySpace>
+template <typename ExecutionSpace>
 struct FactorizeModeAndAlgo;
 
-template <>
-struct FactorizeModeAndAlgo<Kokkos::HostSpace> {
+struct FactorizeModeAndAlgoHostImpl {
   typedef Mode::Serial mode_type;
   typedef Algo::Level3::Blocked algo_type;
 };
 
-#if defined(KOKKOS_ENABLE_CUDA)
+#if defined(KOKKOS_ENABLE_SERIAL)
+template <>
+struct FactorizeModeAndAlgo<Kokkos::Serial> : FactorizeModeAndAlgoHostImpl {};
+#endif
+
+#if defined(KOKKOS_ENABLE_THREADS)
+template <>
+struct FactorizeModeAndAlgo<Kokkos::Threads> : FactorizeModeAndAlgoHostImpl {};
+#endif
+
+#if defined(KOKKOS_ENABLE_OPENMP)
 template <>
-struct FactorizeModeAndAlgo<Kokkos::CudaSpace> {
+struct FactorizeModeAndAlgo<Kokkos::OpenMP> : FactorizeModeAndAlgoHostImpl {};
+#endif
+
+struct FactorizeModeAndAlgoDeviceImpl {
   typedef Mode::Team mode_type;
   typedef Algo::Level3::Unblocked algo_type;
 };
+
+#if defined(KOKKOS_ENABLE_CUDA)
+template <>
+struct FactorizeModeAndAlgo<Kokkos::Cuda> : FactorizeModeAndAlgoDeviceImpl {};
+#endif
+
+#if defined(KOKKOS_ENABLE_HIP)
+template <>
+struct FactorizeModeAndAlgo<Kokkos::Experimental::HIP>
+    : FactorizeModeAndAlgoDeviceImpl {};
 #endif
 
-template <typename ActiveMemorySpace>
+template <typename ExecutionSpace>
 struct SolveModeAndAlgo;
 
-template <>
-struct SolveModeAndAlgo<Kokkos::HostSpace> {
+struct SolveModeAndAlgoHostImpl {
   typedef Mode::Serial mode_type;
   typedef Algo::Level2::Blocked algo_type;
 };
 
-#if defined(KOKKOS_ENABLE_CUDA)
+#if defined(KOKKOS_ENABLE_SERIAL)
 template <>
-struct SolveModeAndAlgo<Kokkos::CudaSpace> {
+struct SolveModeAndAlgo<Kokkos::Serial> : SolveModeAndAlgoHostImpl {};
+#endif
+
+#if defined(KOKKOS_ENABLE_THREADS)
+template <>
+struct SolveModeAndAlgo<Kokkos::Threads> : SolveModeAndAlgoHostImpl {};
+#endif
+
+#if defined(KOKKOS_ENABLE_OPENMP)
+template <>
+struct SolveModeAndAlgo<Kokkos::OpenMP> : SolveModeAndAlgoHostImpl {};
+#endif
+
+struct SolveModeAndAlgoDeviceImpl {
   typedef Mode::Team mode_type;
   typedef Algo::Level2::Unblocked algo_type;
 };
+
+#if defined(KOKKOS_ENABLE_CUDA)
+template <>
+struct SolveModeAndAlgo<Kokkos::Cuda> : SolveModeAndAlgoDeviceImpl {};
+#endif
+
+#if defined(KOKKOS_ENABLE_HIP)
+template <>
+struct SolveModeAndAlgo<Kokkos::Experimental::HIP>
+    : SolveModeAndAlgoDeviceImpl {};
 #endif
 
+template <class VT>
+struct SetTridiagToIdentity {
+ private:
+  VT __AA;
+
+ public:
+  SetTridiagToIdentity(VT AA) : __AA(AA) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const member_type &member) const {
+    const int i = member.league_rank();
+    Kokkos::parallel_for(
+        Kokkos::TeamThreadRange(member, __AA.extent(1)), [&](const int &j) {
+          Kokkos::parallel_for(
+              Kokkos::ThreadVectorRange(member, __AA.extent(5)),
+              [&](const int &v) {
+                for (int k = 0, kend = __AA.extent(3); k < kend; ++k)
+                  __AA(i, j, 1, k, k, v) = 1;
+              });
+        });
+  }
+};
+
+template <class VT, class LT>
+struct Factorize {
+ private:
+  VT __AA;
+  LT __L;
+
+ public:
+  Factorize(VT AA, LT L) : __AA(AA), __L(L) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const member_type &member) const {
+    typedef FactorizeModeAndAlgo<Kokkos::DefaultExecutionSpace>
+        default_mode_and_algo_type;
+    typedef default_mode_and_algo_type::mode_type mode_type;
+    typedef default_mode_and_algo_type::algo_type algo_type;
+
+    const int i = member.league_rank();
+
+    Kokkos::parallel_for(
+        Kokkos::ThreadVectorRange(member, __AA.extent(5)), [&](const int &v) {
+          auto AAA = Kokkos::subview(__AA, i, Kokkos::ALL(), Kokkos::ALL(),
+                                     Kokkos::ALL(), Kokkos::ALL(), v);
+
+          /// subview patterns
+          auto A = Kokkos::subview(AAA, 0, 1, Kokkos::ALL(), Kokkos::ALL());
+          auto B = Kokkos::subview(AAA, 0, 2, Kokkos::ALL(), Kokkos::ALL());
+          auto C = Kokkos::subview(AAA, 0, 0, Kokkos::ALL(), Kokkos::ALL());
+          auto D = Kokkos::subview(AAA, 0, 1, Kokkos::ALL(), Kokkos::ALL());
+
+          if (__L == 1) {
+            A.assign_data(&AAA(0, 1, 0, 0));
+            LU<member_type, mode_type, algo_type>::invoke(member, A);
+          } else {
+            for (int k = 0; k < (__L - 1); ++k) {
+              A.assign_data(&AAA(k, 1, 0, 0));
+              B.assign_data(&AAA(k, 2, 0, 0));
+              C.assign_data(&AAA(k, 0, 0, 0));
+              D.assign_data(&AAA(k + 1, 1, 0, 0));
+
+              LU<member_type, mode_type, algo_type>::invoke(member, A);
+              Trsm<member_type, Side::Left, Uplo::Lower, Trans::NoTranspose,
+                   Diag::Unit, mode_type, algo_type>::invoke(member, 1.0, A, B);
+              Trsm<member_type, Side::Right, Uplo::Upper, Trans::NoTranspose,
+                   Diag::NonUnit, mode_type, algo_type>::invoke(member, 1.0, A,
+                                                                C);
+              Gemm<member_type, Trans::NoTranspose, Trans::NoTranspose,
+                   mode_type, algo_type>::invoke(member, -1.0, C, B, 1.0, D);
+            }
+            LU<member_type, mode_type, algo_type>::invoke(member, D);
+          }
+        });
+  }
+};
+
 int main(int argc, char *argv[]) {
   Kokkos::initialize(argc, argv);
   {
@@ -149,53 +262,56 @@ int main(int argc, char *argv[]) {
     ///
 
     /// double 16
-    Kokkos::View<vector_type *****, Kokkos::LayoutRight, exec_space> Av(
+    Kokkos::View<vector_type *****, Kokkos::LayoutRight, exec_space_type> Av(
         "A", N / vector_length, L, 3, Blk, Blk);
 
     /// double
-    Kokkos::View<value_type ******, Kokkos::LayoutRight, exec_space> As(
+    Kokkos::View<value_type ******, Kokkos::LayoutRight, exec_space_type> As(
         (value_type *)Av.data(), Av.extent(0), Av.extent(1), Av.extent(2),
         Av.extent(3), Av.extent(4), vector_length);
 
     /// double 2
-    Kokkos::View<internal_vector_type ******, Kokkos::LayoutRight, exec_space>
+    Kokkos::View<internal_vector_type ******, Kokkos::LayoutRight,
+                 exec_space_type>
         Ai((internal_vector_type *)Av.data(), Av.extent(0), Av.extent(1),
            Av.extent(2), Av.extent(3), Av.extent(4),
            vector_length / internal_vector_length);
     /// double 16
-    Kokkos::View<vector_type ****, Kokkos::LayoutRight, exec_space> xv(
+    Kokkos::View<vector_type ****, Kokkos::LayoutRight, exec_space_type> xv(
         "x", N / vector_length, Nvec, L, Blk);
 
     /// double
-    Kokkos::View<value_type *****, Kokkos::LayoutRight, exec_space> xs(
+    Kokkos::View<value_type *****, Kokkos::LayoutRight, exec_space_type> xs(
         (value_type *)xv.data(), xv.extent(0), xv.extent(1), xv.extent(2),
         xv.extent(3), vector_length);
 
     /// double 2
-    Kokkos::View<internal_vector_type *****, Kokkos::LayoutRight, exec_space>
+    Kokkos::View<internal_vector_type *****, Kokkos::LayoutRight,
+                 exec_space_type>
         xi((internal_vector_type *)xv.data(), xv.extent(0), xv.extent(1),
            xv.extent(2), xv.extent(3), vector_length / internal_vector_length);
 
     /// double 16
-    Kokkos::View<vector_type ****, Kokkos::LayoutRight, exec_space> bv(
+    Kokkos::View<vector_type ****, Kokkos::LayoutRight, exec_space_type> bv(
         "b", N / vector_length, Nvec, L, Blk);
 
     /// double
-    Kokkos::View<value_type *****, Kokkos::LayoutRight, exec_space> bs(
+    Kokkos::View<value_type *****, Kokkos::LayoutRight, exec_space_type> bs(
         (value_type *)bv.data(), bv.extent(0), bv.extent(1), bv.extent(2),
         bv.extent(3), vector_length);
 
     /// double 2
-    Kokkos::View<internal_vector_type *****, Kokkos::LayoutRight, exec_space>
+    Kokkos::View<internal_vector_type *****, Kokkos::LayoutRight,
+                 exec_space_type>
         bi((internal_vector_type *)bv.data(), bv.extent(0), bv.extent(1),
            bv.extent(2), bv.extent(3), vector_length / internal_vector_length);
 
     /// double copy of A
-    Kokkos::View<value_type ******, Kokkos::LayoutRight, exec_space> Acopy(
+    Kokkos::View<value_type ******, Kokkos::LayoutRight, exec_space_type> Acopy(
         "Acopy", As.extent(0), As.extent(1), As.extent(2), As.extent(3),
         As.extent(4), As.extent(5));
 
-    Kokkos::View<value_type *****, Kokkos::LayoutRight, exec_space> rs(
+    Kokkos::View<value_type *****, Kokkos::LayoutRight, exec_space_type> rs(
         "rs", bs.extent(0), bs.extent(1), bs.extent(2), bs.extent(3),
         bs.extent(4));
 
@@ -217,24 +333,9 @@ int main(int argc, char *argv[]) {
       cudaProfilerStart();
 #endif
       timer.reset();
-      using policy_type = Kokkos::TeamPolicy<exec_space>;
-      using member_type = typename policy_type::member_type;
       policy_type policy(AA.extent(0), Kokkos::AUTO(), AA.extent(5));
-      Kokkos::parallel_for(
-          "setTridiagToIdentity", policy,
-          KOKKOS_LAMBDA(const member_type &member) {
-            const int i = member.league_rank();
-            Kokkos::parallel_for(
-                Kokkos::TeamThreadRange(member, AA.extent(1)),
-                [&](const int &j) {
-                  Kokkos::parallel_for(
-                      Kokkos::ThreadVectorRange(member, AA.extent(5)),
-                      [&](const int &v) {
-                        for (int k = 0, kend = AA.extent(3); k < kend; ++k)
-                          AA(i, j, 1, k, k, v) = 1;
-                      });
-                });
-          });
+      Kokkos::parallel_for("setTridiagToIdentity", policy,
+                           SetTridiagToIdentity<decltype(AA)>(AA));
       Kokkos::fence();
       const double t = timer.seconds();
 #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSBATCHED_PROFILE)
@@ -246,7 +347,7 @@ int main(int argc, char *argv[]) {
     /// randomize input
     {
       const value_type one(1);
-      Kokkos::Random_XorShift64_Pool<exec_space> random(13245);
+      Kokkos::Random_XorShift64_Pool<exec_space_type> random(13245);
       Kokkos::fill_random(As, random, one);
       Kokkos::fill_random(bs, random, one);
 
@@ -261,9 +362,7 @@ int main(int argc, char *argv[]) {
       cudaProfilerStart();
 #endif
       timer.reset();
-      using policy_type = Kokkos::TeamPolicy<exec_space>;
-      using member_type = typename policy_type::member_type;
-      int team_size     = 0;
+      int team_size = 0;
       if (Blk < 8) {
         team_size = 32 / AA.extent(5);
       } else if (Blk < 12) {
@@ -273,59 +372,9 @@ int main(int argc, char *argv[]) {
       }
 
       policy_type policy(AA.extent(0), team_size, AA.extent(5));
-      Kokkos::parallel_for(
-          "factorize", policy.set_scratch_size(0, Kokkos::PerTeam(S)),
-          KOKKOS_LAMBDA(const member_type &member) {
-            typedef FactorizeModeAndAlgo<
-                Kokkos::Impl::ActiveExecutionMemorySpace>
-                default_mode_and_algo_type;
-            typedef default_mode_and_algo_type::mode_type mode_type;
-            typedef default_mode_and_algo_type::algo_type algo_type;
-
-            const int i = member.league_rank();
-
-            Kokkos::parallel_for(
-                Kokkos::ThreadVectorRange(member, AA.extent(5)),
-                [&](const int &v) {
-                  auto AAA =
-                      Kokkos::subview(AA, i, Kokkos::ALL(), Kokkos::ALL(),
-                                      Kokkos::ALL(), Kokkos::ALL(), v);
-
-                  /// subview patterns
-                  auto A =
-                      Kokkos::subview(AAA, 0, 1, Kokkos::ALL(), Kokkos::ALL());
-                  auto B =
-                      Kokkos::subview(AAA, 0, 2, Kokkos::ALL(), Kokkos::ALL());
-                  auto C =
-                      Kokkos::subview(AAA, 0, 0, Kokkos::ALL(), Kokkos::ALL());
-                  auto D =
-                      Kokkos::subview(AAA, 0, 1, Kokkos::ALL(), Kokkos::ALL());
-
-                  if (L == 1) {
-                    A.assign_data(&AAA(0, 1, 0, 0));
-                    LU<member_type, mode_type, algo_type>::invoke(member, A);
-                  } else {
-                    for (int k = 0; k < (L - 1); ++k) {
-                      A.assign_data(&AAA(k, 1, 0, 0));
-                      B.assign_data(&AAA(k, 2, 0, 0));
-                      C.assign_data(&AAA(k, 0, 0, 0));
-                      D.assign_data(&AAA(k + 1, 1, 0, 0));
-
-                      LU<member_type, mode_type, algo_type>::invoke(member, A);
-                      Trsm<member_type, Side::Left, Uplo::Lower,
-                           Trans::NoTranspose, Diag::Unit, mode_type,
-                           algo_type>::invoke(member, 1.0, A, B);
-                      Trsm<member_type, Side::Right, Uplo::Upper,
-                           Trans::NoTranspose, Diag::NonUnit, mode_type,
-                           algo_type>::invoke(member, 1.0, A, C);
-                      Gemm<member_type, Trans::NoTranspose, Trans::NoTranspose,
-                           mode_type, algo_type>::invoke(member, -1.0, C, B,
-                                                         1.0, D);
-                    }
-                    LU<member_type, mode_type, algo_type>::invoke(member, D);
-                  }
-                });
-          });
+      Kokkos::parallel_for("factorize",
+                           policy.set_scratch_size(0, Kokkos::PerTeam(S)),
+                           Factorize<decltype(AA), decltype(L)>(AA, L));
       Kokkos::fence();
       const double t = timer.seconds();
 #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSBATCHED_PROFILE)
@@ -343,9 +392,7 @@ int main(int argc, char *argv[]) {
       cudaProfilerStart();
 #endif
       timer.reset();
-      using policy_type = Kokkos::TeamPolicy<exec_space>;
-      using member_type = typename policy_type::member_type;
-      int team_size     = 0;
+      int team_size = 0;
       if (Blk < 8) {
         team_size = 32 / AA.extent(5);
       } else if (Blk < 12) {
@@ -359,7 +406,7 @@ int main(int argc, char *argv[]) {
         Kokkos::parallel_for(
             "solve", policy.set_scratch_size(0, Kokkos::PerTeam(S)),
             KOKKOS_LAMBDA(const member_type &member) {
-              typedef SolveModeAndAlgo<Kokkos::Impl::ActiveExecutionMemorySpace>
+              typedef SolveModeAndAlgo<Kokkos::DefaultExecutionSpace>
                   default_mode_and_algo_type;
               typedef default_mode_and_algo_type::mode_type mode_type;
               typedef default_mode_and_algo_type::algo_type algo_type;
@@ -488,8 +535,6 @@ int main(int argc, char *argv[]) {
     ///
     if (1) {
       typedef KokkosBatched::Algo::Level2::Unblocked algo_type;
-      using policy_type = Kokkos::TeamPolicy<exec_space>;
-      using member_type = typename policy_type::member_type;
       policy_type policy(Acopy.extent(0), Kokkos::AUTO(), Acopy.extent(5));
       Kokkos::parallel_for(
           "compute residual", policy, KOKKOS_LAMBDA(const member_type &member) {
@@ -639,7 +684,3 @@ int main(int argc, char *argv[]) {
 
   return 0;
 }
-
-#else
-int main() { return 0; }
-#endif
diff --git a/perf_test/batched/KokkosBatched_Test_BlockTridiagJacobi.cpp b/perf_test/batched/KokkosBatched_Test_BlockTridiagJacobi.cpp
index fb9cd6297d..8513cad752 100644
--- a/perf_test/batched/KokkosBatched_Test_BlockTridiagJacobi.cpp
+++ b/perf_test/batched/KokkosBatched_Test_BlockTridiagJacobi.cpp
@@ -3,11 +3,9 @@
 #include "Kokkos_Timer.hpp"
 #include "Kokkos_Random.hpp"
 
-#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
-#if !defined(KOKKOS_ENABLE_CUDA) || (8000 <= CUDA_VERSION)
+#if !(defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_CUDA_LAMBDA))
 #define KOKKOSBATCHED_TEST_BLOCKTRIDIAGJACOBI
 #endif
-#endif
 
 #if defined(KOKKOSBATCHED_TEST_BLOCKTRIDIAGJACOBI)
 
@@ -75,38 +73,86 @@ typedef Vector<SIMD<value_type>, internal_vector_length> internal_vector_type;
 typedef value_type internal_vector_type;
 #endif
 
-template <typename ActiveMemorySpace>
+template <typename ExecutionSpace>
 struct InverseDiagonalsModeAndAlgo;
 
-template <>
-struct InverseDiagonalsModeAndAlgo<Kokkos::HostSpace> {
+struct InverseDiagonalsModeAndAlgoHostImpl {
   typedef Mode::Serial mode_type;
   typedef Algo::Level3::Blocked algo_type;
 };
 
-#if defined(KOKKOS_ENABLE_CUDA)
+#if defined(KOKKOS_ENABLE_SERIAL)
+template <>
+struct InverseDiagonalsModeAndAlgo<Kokkos::Serial>
+    : InverseDiagonalsModeAndAlgoHostImpl {};
+#endif
+
+#if defined(KOKKOS_ENABLE_THREADS)
+template <>
+struct InverseDiagonalsModeAndAlgo<Kokkos::Threads>
+    : InverseDiagonalsModeAndAlgoHostImpl {};
+#endif
+
+#if defined(KOKKOS_ENABLE_ONPENMP)
 template <>
-struct InverseDiagonalsModeAndAlgo<Kokkos::CudaSpace> {
+struct InverseDiagonalsModeAndAlgo<Kokkos::Threads>
+    : InverseDiagonalsModeAndAlgoHostImpl {};
+#endif
+
+struct InverseDiagonalsModeAndAlgoDeviceImpl {
   typedef Mode::Team mode_type;
   typedef Algo::Level3::Unblocked algo_type;
 };
+
+#if defined(KOKKOS_ENABLE_CUDA)
+template <>
+struct InverseDiagonalsModeAndAlgo<Kokkos::Cuda>
+    : InverseDiagonalsModeAndAlgoDeviceImpl {};
+#endif
+
+#if defined(KOKKOS_ENABLE_HIP)
+template <>
+struct InverseDiagonalsModeAndAlgo<Kokkos::Experimental::HIP>
+    : InverseDiagonalsModeAndAlgoDeviceImpl {};
 #endif
 
-template <typename ActiveMemorySpace>
+template <typename ExecutionSpace>
 struct SolveModeAndAlgo;
 
-template <>
-struct SolveModeAndAlgo<Kokkos::HostSpace> {
+struct SolveModeAndAlgoHostImpl {
   typedef Mode::Serial mode_type;
   typedef Algo::Level2::Blocked algo_type;
 };
 
-#if defined(KOKKOS_ENABLE_CUDA)
+#if defined(KOKKOS_ENABLE_SERIAL)
+template <>
+struct SolveModeAndAlgo<Kokkos::Serial> : SolveModeAndAlgoHostImpl {};
+#endif
+
+#if defined(KOKKOS_ENABLE_THREADS)
+template <>
+struct SolveModeAndAlgo<Kokkos::Threads> : SolveModeAndAlgoHostImpl {};
+#endif
+
+#if defined(KOKKOS_ENABLE_OPENMP)
 template <>
-struct SolveModeAndAlgo<Kokkos::CudaSpace> {
+struct SolveModeAndAlgo<Kokkos::OpenMP> : SolveModeAndAlgoHostImpl {};
+#endif
+
+struct SolveModeAndAlgoDeviceImpl {
   typedef Mode::Team mode_type;
   typedef Algo::Level2::Unblocked algo_type;
 };
+
+#if defined(KOKKOS_ENABLE_CUDA)
+template <>
+struct SolveModeAndAlgo<Kokkos::Cuda> : SolveModeAndAlgoDeviceImpl {};
+#endif
+
+#if defined(KOKKOS_ENABLE_HIP)
+template <>
+struct SolveModeAndAlgo<Kokkos::Experimental::HIP>
+    : SolveModeAndAlgoDeviceImpl {};
 #endif
 
 int main(int argc, char *argv[]) {
@@ -282,8 +328,7 @@ int main(int argc, char *argv[]) {
           policy.set_scratch_size(
               0, Kokkos::PerTeam(S < per_team_scratch ? per_team_scratch : S)),
           KOKKOS_LAMBDA(const member_type &member) {
-            typedef InverseDiagonalsModeAndAlgo<
-                Kokkos::Impl::ActiveExecutionMemorySpace>
+            typedef InverseDiagonalsModeAndAlgo<Kokkos::DefaultExecutionSpace>
                 default_mode_and_algo_type;
             typedef default_mode_and_algo_type::mode_type mode_type;
             typedef default_mode_and_algo_type::algo_type algo_type;
@@ -365,8 +410,7 @@ int main(int argc, char *argv[]) {
                   0,
                   Kokkos::PerTeam(S < per_team_scratch ? per_team_scratch : S)),
               KOKKOS_LAMBDA(const member_type &member) {
-                typedef SolveModeAndAlgo<
-                    Kokkos::Impl::ActiveExecutionMemorySpace>
+                typedef SolveModeAndAlgo<Kokkos::DefaultExecutionSpace>
                     default_mode_and_algo_type;
                 typedef default_mode_and_algo_type::mode_type mode_type;
                 typedef default_mode_and_algo_type::algo_type algo_type;
diff --git a/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test.cpp b/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test.cpp
index 49032307c4..7b353cf160 100644
--- a/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test.cpp
+++ b/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test.cpp
@@ -50,6 +50,7 @@
 struct Params {
   int use_cuda    = 0;
   int use_hip     = 0;
+  int use_sycl    = 0;
   int use_openmp  = 0;
   int use_threads = 0;
   // m is vector length
@@ -63,7 +64,8 @@ void print_options() {
   std::cerr << "Options:\n" << std::endl;
 
   std::cerr << "\tBACKEND: '--threads[numThreads]' | '--openmp [numThreads]' | "
-               "'--cuda [cudaDeviceIndex]' | '--hip [hipDeviceIndex]'"
+               "'--cuda [cudaDeviceIndex]' | '--hip [hipDeviceIndex]' | "
+               "'--sycl [syclDeviceIndex]'"
             << std::endl;
   std::cerr << "\tIf no BACKEND selected, serial is the default." << std::endl;
   std::cerr << "\t[Optional] --repeat :: how many times to repeat overall "
@@ -90,6 +92,8 @@ int parse_inputs(Params& params, int argc, char** argv) {
       params.use_cuda = atoi(argv[++i]) + 1;
     } else if (0 == Test::string_compare_no_case(argv[i], "--hip")) {
       params.use_hip = atoi(argv[++i]) + 1;
+    } else if (0 == Test::string_compare_no_case(argv[i], "--sycl")) {
+      params.use_sycl = atoi(argv[++i]) + 1;
     } else if (0 == Test::string_compare_no_case(argv[i], "--m")) {
       params.m = atoi(argv[++i]);
     } else if (0 == Test::string_compare_no_case(argv[i], "--n")) {
@@ -190,17 +194,21 @@ int main(int argc, char** argv) {
   if (parse_inputs(params, argc, argv)) {
     return 1;
   }
-  const int device_id = std::max(params.use_cuda, params.use_hip) - 1;
+  const int device_id =
+      std::max(std::max(params.use_cuda, params.use_hip), params.use_sycl) - 1;
 
   const int num_threads = std::max(params.use_openmp, params.use_threads);
 
-  Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id));
+  Kokkos::initialize(Kokkos::InitializationSettings()
+                         .set_num_threads(num_threads)
+                         .set_device_id(device_id));
 
   bool useThreads = params.use_threads != 0;
   bool useOMP     = params.use_openmp != 0;
   bool useCUDA    = params.use_cuda != 0;
   bool useHIP     = params.use_hip != 0;
-  bool useSerial  = !useThreads && !useOMP && !useCUDA && !useHIP;
+  bool useSYCL    = params.use_sycl != 0;
+  bool useSerial  = !useThreads && !useOMP && !useCUDA && !useHIP && !useSYCL;
 
   if (useThreads) {
 #if defined(KOKKOS_ENABLE_THREADS)
@@ -234,6 +242,14 @@ int main(int argc, char** argv) {
 #else
     std::cout << "ERROR: HIP requested, but not available.\n";
     return 1;
+#endif
+  }
+  if (useSYCL) {
+#if defined(KOKKOS_ENABLE_SYCL)
+    run<Kokkos::Experimental::SYCL>(params.m, params.n, params.repeat);
+#else
+    std::cout << "ERROR: SYCL requested, but not available.\n";
+    return 1;
 #endif
   }
   if (useSerial) {
diff --git a/perf_test/blas/blas1/KokkosBlas_dot_perf_test.cpp b/perf_test/blas/blas1/KokkosBlas_dot_perf_test.cpp
index 9219d34810..50840ddea6 100644
--- a/perf_test/blas/blas1/KokkosBlas_dot_perf_test.cpp
+++ b/perf_test/blas/blas1/KokkosBlas_dot_perf_test.cpp
@@ -54,6 +54,8 @@ struct Params {
   int use_cuda    = 0;
   int use_openmp  = 0;
   int use_threads = 0;
+  int use_hip     = 0;
+  int use_sycl    = 0;
   // m is vector length
   int m      = 100000;
   int repeat = 1;
@@ -63,7 +65,8 @@ void print_options() {
   std::cerr << "Options:\n" << std::endl;
 
   std::cerr << "\tBACKEND: '--threads[numThreads]' | '--openmp [numThreads]' | "
-               "'--cuda [cudaDeviceIndex]'"
+               "'--cuda [cudaDeviceIndex]' | '--hip [hipDeviceIndex]' | "
+               "'--sycl [syclDeviceIndex]'"
             << std::endl;
   std::cerr << "\tIf no BACKEND selected, serial is the default." << std::endl;
   std::cerr << "\t[Optional] --repeat :: how many times to repeat overall "
@@ -86,6 +89,10 @@ int parse_inputs(Params& params, int argc, char** argv) {
       params.use_openmp = atoi(argv[++i]);
     } else if (0 == Test::string_compare_no_case(argv[i], "--cuda")) {
       params.use_cuda = atoi(argv[++i]) + 1;
+    } else if (0 == Test::string_compare_no_case(argv[i], "--hip")) {
+      params.use_hip = atoi(argv[++i]) + 1;
+    } else if (0 == Test::string_compare_no_case(argv[i], "--sycl")) {
+      params.use_sycl = atoi(argv[++i]) + 1;
     } else if (0 == Test::string_compare_no_case(argv[i], "--m")) {
       params.m = atoi(argv[++i]);
     } else if (0 == Test::string_compare_no_case(argv[i], "--repeat")) {
@@ -184,16 +191,21 @@ int main(int argc, char** argv) {
   if (parse_inputs(params, argc, argv)) {
     return 1;
   }
-  const int device_id = params.use_cuda - 1;
+  const int device_id =
+      std::max(std::max(params.use_cuda, params.use_hip), params.use_sycl) - 1;
 
   const int num_threads = std::max(params.use_openmp, params.use_threads);
 
-  Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id));
+  Kokkos::initialize(Kokkos::InitializationSettings()
+                         .set_num_threads(num_threads)
+                         .set_device_id(device_id));
 
   bool useThreads = params.use_threads != 0;
   bool useOMP     = params.use_openmp != 0;
   bool useCUDA    = params.use_cuda != 0;
-  bool useSerial  = !useThreads && !useOMP && !useCUDA;
+  bool useHIP     = params.use_hip != 0;
+  bool useSYCL    = params.use_sycl != 0;
+  bool useSerial  = !useThreads && !useOMP && !useCUDA && !useHIP && !useSYCL;
 
   if (useThreads) {
 #if defined(KOKKOS_ENABLE_THREADS)
@@ -221,6 +233,25 @@ int main(int argc, char** argv) {
     return 1;
 #endif
   }
+
+  if (useHIP) {
+#if defined(KOKKOS_ENABLE_HIP)
+    run<Kokkos::Experimental::HIP>(params.m, params.repeat);
+#else
+    std::cout << "ERROR: HIP requested, but not available.\n";
+    return 1;
+#endif
+  }
+
+  if (useSYCL) {
+#if defined(KOKKOS_ENABLE_SYCL)
+    run<Kokkos::Experimental::SYCL>(params.m, params.repeat);
+#else
+    std::cout << "ERROR: SYCL requested, but not available.\n";
+    return 1;
+#endif
+  }
+
   if (useSerial) {
 #if defined(KOKKOS_ENABLE_SERIAL)
     run<Kokkos::Serial>(params.m, params.repeat);
diff --git a/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test.cpp b/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test.cpp
index f8a2a5aa43..eeb49d6502 100644
--- a/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test.cpp
+++ b/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test.cpp
@@ -188,7 +188,9 @@ int main(int argc, char** argv) {
 
   const int num_threads = std::max(params.use_openmp, params.use_threads);
 
-  Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id));
+  Kokkos::initialize(Kokkos::InitializationSettings()
+                         .set_num_threads(num_threads)
+                         .set_device_id(device_id));
 
   bool useThreads = params.use_threads != 0;
   bool useOMP     = params.use_openmp != 0;
diff --git a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp
index a82ece030b..98e974229b 100644
--- a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp
+++ b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp
@@ -180,7 +180,9 @@ int main(int argc, char** argv) {
   const int num_threads = std::max(params.use_openmp, params.use_threads);
 
   const int device_id = std::max(params.use_cuda, params.use_hip) - 1;
-  Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id));
+  Kokkos::initialize(Kokkos::InitializationSettings()
+                         .set_num_threads(num_threads)
+                         .set_device_id(device_id));
 
   // Create booleans to handle pthreads, openmp and cuda params and initialize
   // to true;
diff --git a/perf_test/blas/blas3/KokkosBatched_BatchedGemm_benchmark.sh b/perf_test/blas/blas3/KokkosBatched_BatchedGemm_benchmark.sh
index 0b08977748..3b382a474c 100755
--- a/perf_test/blas/blas3/KokkosBatched_BatchedGemm_benchmark.sh
+++ b/perf_test/blas/blas3/KokkosBatched_BatchedGemm_benchmark.sh
@@ -18,8 +18,8 @@ function printhelp() {
   echo "--Usage--"
   echo "$0 PRECISION HOST_ARCH <ACCELERATOR_ARCH>"
   echo "  PRECISION:        Kokkos::Experimental::half_t, float, double"
-  echo "  HOST_ARCH:        POWER9, A64FX, SKX"
-  echo "  ACCELERATOR_ARCH: VOLTA70"
+  echo "  HOST_ARCH:        POWER9, A64FX, SKX, SNB, DEFAULT"
+  echo "  ACCELERATOR_ARCH: VOLTA70 AMPERE80"
   echo ""
 }
 
@@ -47,10 +47,10 @@ function beval() {
 # Handle input args
 export KOKKOS_SRC_DIR=${KOKKOS_SRC_DIR:-"$HOME/KOKKOS.base/kokkos"}
 export KOKKOS_SRC_DIR=$(realpath $KOKKOS_SRC_DIR)
-export KOKKOS_SHA=${KOKKOS_SHA:-"b9f15a4"} # Tip of develop as of 10-14-21
+export KOKKOS_SHA=${KOKKOS_SHA:-"tags/3.6.00"}
 export KOKKOSKERNELS_SRC_DIR=${KOKKOSKERNELS_SRC_DIR:-"$HOME/KOKKOS.base/kokkos-kernels"}
 export KOKKOSKERNELS_SRC_DIR=$(realpath $KOKKOSKERNELS_SRC_DIR)
-export KOKKOSKERNELS_SHA=${KOKKOSKERNELS_SHA:-"a2fff48"} # Tip of developer as of 10-14-21
+export KOKKOSKERNELS_SHA=${KOKKOSKERNELS_SHA:-"tags/papers/us-rse-escience-2022"}
 envprint KOKKOS_SRC_DIR KOKKOS_SHA KOKKOSKERNELS_SRC_DIR KOKKOSKERNELS_SHA
 
 dry_run="off"
@@ -82,7 +82,7 @@ elif [ "$arch_names" == "POWER9 VOLTA70" ]; then
 
   kokkoskernels_config_cmd="cd $KOKKOSKERNELS_BUILD_DIR && $KOKKOSKERNELS_SRC_DIR/cm_generate_makefile.bash \
                             --arch=Power9,Volta70 --with-cuda=$CUDA_PATH --compiler=$KOKKOS_INSTALL_DIR/bin/nvcc_wrapper \
-                            --cxxflags='-O3' --with-scalars=$precision \
+                            --cxxflags='-O3' --disable-tests --enable-examples --with-scalars=$precision \
                             --kokkos-path=$KOKKOS_SRC_DIR --kokkoskernels-path=$KOKKOSKERNELS_SRC_DIR \
                             --kokkos-prefix=$KOKKOS_INSTALL_DIR --prefix=$KOKKOSKERNELS_INSTALL_DIR 2>&1 | \
                             tee kokkoskernels_config_cmd.out"
@@ -93,6 +93,49 @@ elif [ "$arch_names" == "POWER9 VOLTA70" ]; then
   kokkos_build_cmd="bsub -q rhel7W -W 2:00 -Is $KOKKOS_BUILD_DIR/build.sh"
   kokkoskernels_build_cmd="bsub -q rhel7W -W 2:00 -Is $KOKKOSKERNELS_BUILD_DIR/build.sh"
   benchmark_cmd="bsub -q rhel7W -W 2:00 -Is $KOKKOSKERNELS_BUILD_DIR/bench.sh"
+elif [ "$arch_names" == "SNB VOLTA70" ]; then
+  module purge
+  module load sems-archive-env sems-env sems-gcc/8.3.0 sems-cmake/3.19.1 cuda/11.2 sems-archive-git/2.10.1
+  kokkos_config_cmd="cd $KOKKOS_BUILD_DIR && $KOKKOS_SRC_DIR/generate_makefile.bash --cxxflags='-O3' \
+                     --arch=SNB,Volta70 --with-cuda=$CUDA_PATH --compiler=$KOKKOS_SRC_DIR/bin/nvcc_wrapper \
+                     --kokkos-path=$KOKKOS_SRC_DIR --prefix=$KOKKOS_INSTALL_DIR 2>&1 | tee kokkos_config_cmd.out"
+  kokkos_config_defaults_cmd="cd $KOKKOS_BUILD_DIR && cmake -DKokkos_ENABLE_TESTS:BOOL=OFF $KOKKOS_SRC_DIR 2>&1 \
+                              | tee -a kokkos_config_cmd.out"
+
+  kokkoskernels_config_cmd="cd $KOKKOSKERNELS_BUILD_DIR && $KOKKOSKERNELS_SRC_DIR/cm_generate_makefile.bash \
+                            --arch=SNB,Volta70 --with-cuda=$CUDA_PATH --compiler=$KOKKOS_INSTALL_DIR/bin/nvcc_wrapper \
+                            --cxxflags='-O3' --with-scalars=$precision \
+                            --kokkos-path=$KOKKOS_SRC_DIR --kokkoskernels-path=$KOKKOSKERNELS_SRC_DIR \
+                            --kokkos-prefix=$KOKKOS_INSTALL_DIR --prefix=$KOKKOSKERNELS_INSTALL_DIR 2>&1 | \
+                            tee kokkoskernels_config_cmd.out"
+  kokkoskernels_config_defaults_cmd="cd $KOKKOSKERNELS_BUILD_DIR && cmake -DKokkosKernels_INST_LAYOUTLEFT:BOOL=OFF \
+                                   -DKokkosKernels_INST_LAYOUTRIGHT:BOOL=ON -DKokkosKernels_INST_DOUBLE:BOOL=OFF \
+                                   $KOKKOSKERNELS_SRC_DIR 2>&1 | tee -a kokkoskernels_config_cmd.out"
+
+  kokkos_build_cmd="$KOKKOS_BUILD_DIR/build.sh"
+  kokkoskernels_build_cmd="$KOKKOSKERNELS_BUILD_DIR/build.sh"
+  benchmark_cmd="$KOKKOSKERNELS_BUILD_DIR/bench.sh"
+elif [ "$arch_names" == "DEFAULT AMPERE80" ]; then
+  module purge
+  module load cudatoolkit/11.2 cmake/3.22.0
+
+  kokkos_config_cmd="cd $KOKKOS_BUILD_DIR && $KOKKOS_SRC_DIR/generate_makefile.bash --cxxflags='-O3' \
+                    --arch=Ampere80 --with-cuda=$CUDA_HOME --compiler=$KOKKOS_SRC_DIR/bin/nvcc_wrapper \
+                    --kokkos-path=$KOKKOS_SRC_DIR --prefix=$KOKKOS_INSTALL_DIR &> kokkos_config_cmd.out"
+
+  kokkos_config_defaults_cmd="cd $KOKKOS_BUILD_DIR && cmake -DKokkos_ENABLE_TESTS:BOOL=OFF $KOKKOS_SRC_DIR &>  kokkos_config_cmd.out"
+  kokkoskernels_config_cmd="cd $KOKKOSKERNELS_BUILD_DIR && $KOKKOSKERNELS_SRC_DIR/cm_generate_makefile.bash \
+                           --arch=Ampere80 --with-cuda=$CUDA_HOME --compiler=$KOKKOS_INSTALL_DIR/bin/nvcc_wrapper \
+                           --cxxflags='-O3' --with-scalars=$precision \
+                           --kokkos-path=$KOKKOS_SRC_DIR --kokkoskernels-path=$KOKKOSKERNELS_SRC_DIR \
+                           --kokkos-prefix=$KOKKOS_INSTALL_DIR --prefix=$KOKKOSKERNELS_INSTALL_DIR &> kokkoskernels_config_cmd.out"
+
+  kokkoskernels_config_defaults_cmd="cd $KOKKOSKERNELS_BUILD_DIR && cmake -S $KOKKOSKERNELS_SRC_DIR -DKokkosKernels_INST_LAYOUTLEFT:BOOL=OFF \
+                                  -DKokkosKernels_INST_LAYOUTRIGHT:BOOL=ON -DKokkosKernels_INST_DOUBLE:BOOL=OFF &> kokkoskernels_config_cmd.out"
+
+  kokkos_build_cmd="$KOKKOS_BUILD_DIR/build.sh"
+  kokkoskernels_build_cmd="$KOKKOSKERNELS_BUILD_DIR/build.sh"
+  benchmark_cmd="$KOKKOSKERNELS_BUILD_DIR/bench.sh"
 elif [ "$arch_names" == "A64FX " ]; then
   export OMP_PROC_BIND=close
   export OMP_PLACES=cores
@@ -128,7 +171,7 @@ elif [ "$arch_names" == "SKX " ]; then
                        --kokkos-path=$KOKKOS_SRC_DIR --prefix=$KOKKOS_INSTALL_DIR 2>&1 | tee kokkos_config_cmd.out"
     kokkos_config_defaults_cmd="cd $KOKKOS_BUILD_DIR && cmake -DKokkos_ENABLE_TESTS:BOOL=OFF $KOKKOS_SRC_DIR 2>&1 \
                                 | tee -a kokkos_config_cmd.out"
-  
+
     kokkoskernels_config_cmd="cd $KOKKOSKERNELS_BUILD_DIR && $KOKKOSKERNELS_SRC_DIR/cm_generate_makefile.bash \
                               --cxxflags='-O3' --arch=SKX --with-scalars=$precision --with-openmp \
                               --kokkos-path=$KOKKOS_SRC_DIR --kokkoskernels-path=$KOKKOSKERNELS_SRC_DIR \
@@ -137,7 +180,7 @@ elif [ "$arch_names" == "SKX " ]; then
     kokkoskernels_config_defaults_cmd="cd $KOKKOSKERNELS_BUILD_DIR && cmake -DKokkosKernels_INST_LAYOUTLEFT:BOOL=OFF \
                                      -DKokkosKernels_INST_LAYOUTRIGHT:BOOL=ON -DKokkosKernels_INST_DOUBLE:BOOL=OFF \
                                      $KOKKOSKERNELS_SRC_DIR 2>&1 | tee -a kokkoskernels_config_cmd.out"
-  
+
     kokkos_build_cmd="srun --time=2:00:00 -N1 $KOKKOS_BUILD_DIR/build.sh"
     kokkoskernels_build_cmd="srun --time=2:00:00 -N1 $KOKKOSKERNELS_BUILD_DIR/build.sh"
     benchmark_cmd="srun --time=2:00:00 -N1 $KOKKOSKERNELS_BUILD_DIR/bench.sh"
@@ -165,7 +208,7 @@ echo "cd $benchmark_dir" >> $KOKKOSKERNELS_BUILD_DIR/bench.sh
 echo "$KOKKOSKERNELS_BUILD_DIR/perf_test/blas/blas3/KokkosBlas3_perf_test \
       --test=batched_heuristic --routines=gemm --loop_type=parallel --batch_size_last_dim=0 \
       --matrix_size_start=2x2,2x2,2x2 --matrix_size_stop=64x64,64x64,64x64 \
-      --matrix_size_step=2 --batch_size=1024 \
+      --matrix_size_step=2 --batch_size=$((32*1024)) \
       --warm_up_loop=10 --iter=20 --verify=1 \
       ${use_simd} \
       --csv=${benchmark_dir}/${precision}_bench.csv" \
diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
index e3d991c7c1..d1855573e4 100644
--- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
@@ -263,9 +263,11 @@ static std::string gemm_csv_header_str =
 // Flop count formula from lapack working note 41:
 // http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf
 static inline double __gemm_flop_count(double a_m, double a_n, double b_n) {
+  // TODO: if not Kokkos::complex.
   if (std::is_same<double, default_scalar>::value ||
       std::is_same<float, default_scalar>::value ||
-      std::is_same<Kokkos::Experimental::half_t, default_scalar>::value)
+      std::is_same<Kokkos::Experimental::half_t, default_scalar>::value ||
+      std::is_same<Kokkos::Experimental::bhalf_t, default_scalar>::value)
     return 2 * a_m * b_n * a_n;
   else
     // For complex, we need to count 2 flops for each add and 6 flops for each
@@ -1574,8 +1576,8 @@ static inline bool __gemm_print_compare_failure(ViewType h_expected,
                                                 ViewType h_actual, int i, int j,
                                                 int k, double epsilon) {
   STATUS;
-  auto diff = static_cast<double>(Kokkos::Experimental::fabs(
-      static_cast<double>(h_expected(i, j, k) - h_actual(i, j, k))));
+  auto diff =
+      std::fabs(static_cast<double>(h_expected(i, j, k) - h_actual(i, j, k)));
 
   if (diff > epsilon) {
     printf(
@@ -1775,6 +1777,11 @@ static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src,
   Kokkos::deep_copy(dst, h_dst);
   Kokkos::fence();
 #else
+  // Avoid unused parameter warnings:
+  (void)src;
+  (void)dst;
+  (void)options;
+
   Kokkos::abort(
       "Cannot perform simd verification with cuda/10.2.2, rerun with -v 0");
 #endif  // #if (CUDA_VERSION != 10020)
@@ -1883,7 +1890,7 @@ static inline void __gemm_do_verify(options_t options, gemm_args_t gemm_args,
 
   // Check the result
   if (gemm_args.C.data() != nullptr) {
-#if defined(KOKKOSKERNELS_ENABLE_TPL_ARMPL)
+#if defined(KOKKOSKERNELS_ENABLE_TPL_ARMPL) && ARMPL_BUILD >= 1058
     if (options.test == EXPERIMENT) {
       using view_type_2d =
           Kokkos::View<default_scalar **, Kokkos::LayoutStride, default_device>;
@@ -1908,7 +1915,7 @@ static inline void __gemm_do_verify(options_t options, gemm_args_t gemm_args,
         }
       }
     }
-#endif  // KOKKOSKERNELS_ENABLE_TPL_ARMPL
+#endif  // KOKKOSKERNELS_ENABLE_TPL_ARMPL && ARMPL_BUILD >= 1058
     if (__gemm_do_compare<ScalarType, LayoutType>(C_expected, gemm_args.C))
       FATAL_ERROR("Result value mismatch!");
   }
@@ -2078,7 +2085,7 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dims) {
     Kokkos::fence();
   }
 
-#if defined(KOKKOSKERNELS_ENABLE_TPL_ARMPL)
+#if defined(KOKKOSKERNELS_ENABLE_TPL_ARMPL) && ARMPL_BUILD >= 1058
   if (options.test == EXPERIMENT) {
     armpl_int_t bstrd_A, istrd_A, jstrd_A, bstrd_B, istrd_B, jstrd_B, bstrd_C,
         istrd_C, jstrd_C;
@@ -2168,7 +2175,7 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dims) {
     gemm_args.B_pl.mat = B_p;
     gemm_args.C_pl.mat = C_p;
   }
-#endif  // KOKKOSKERNELS_ENABLE_TPL_ARMPL
+#endif  // KOKKOSKERNELS_ENABLE_TPL_ARMPL && ARMPL_BUILD >= 1058
 
   gemm_args.alpha         = options.blas_args.gemm.alpha;
   gemm_args.beta          = options.blas_args.gemm.beta;
diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test.cpp b/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test.cpp
index 595292ebd7..6497db8de3 100644
--- a/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test.cpp
+++ b/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test.cpp
@@ -158,7 +158,9 @@ int main(int argc, char** argv) {
                           // as number of threads
   const int device_id = params.use_cuda - 1;
 
-  Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id));
+  Kokkos::initialize(Kokkos::InitializationSettings()
+                         .set_num_threads(num_threads)
+                         .set_device_id(device_id));
 
   bool useOMP  = params.use_openmp != 0;
   bool useCUDA = params.use_cuda != 0;
diff --git a/perf_test/blas/blas3/papers/kokkos-half-t-rse-escience-2022/reproducer.md b/perf_test/blas/blas3/papers/kokkos-half-t-rse-escience-2022/reproducer.md
new file mode 100644
index 0000000000..e558abbff6
--- /dev/null
+++ b/perf_test/blas/blas3/papers/kokkos-half-t-rse-escience-2022/reproducer.md
@@ -0,0 +1,26 @@
+## To reproduce the half precision results for batched-GEMM:
+```bash
+git clone https://github.com/kokkos/kokkos.git
+git clone https://github.com/kokkos/kokkos-kernels.git
+cd kokkos-kernels
+git checkout tags/papers/us-rse-escience-2022
+cd perf_test/blas/blas3
+export KOKKOS_SRC_DIR=/path/to/kokkos
+export KOKKOSKERNELS_SRC_DIR=/path/to/kokkos-kernels
+```
+
+### On V100
+```bash
+./KokkosBatched_BatchedGemm_benchmark.sh double SNB VOLTA70
+./KokkosBatched_BatchedGemm_benchmark.sh float SNB VOLTA70
+./KokkosBatched_BatchedGemm_benchmark.sh half SNB VOLTA70
+./KokkosBatched_BatchedGemm_benchmark.sh bhalf SNB VOLTA70
+```
+
+### On A100
+```bash
+./KokkosBatched_BatchedGemm_benchmark.sh double DEFAULT AMPERE80
+./KokkosBatched_BatchedGemm_benchmark.sh float DEFAULT AMPERE80
+./KokkosBatched_BatchedGemm_benchmark.sh half DEFAULT AMPERE80
+./KokkosBatched_BatchedGemm_benchmark.sh bhalf DEFAULT AMPERE80
+```
\ No newline at end of file
diff --git a/perf_test/graph/KokkosGraph_color.cpp b/perf_test/graph/KokkosGraph_color.cpp
index 8b16111157..8a97d77a38 100644
--- a/perf_test/graph/KokkosGraph_color.cpp
+++ b/perf_test/graph/KokkosGraph_color.cpp
@@ -55,6 +55,7 @@
 #include "KokkosKernels_TestParameters.hpp"
 #include "KokkosGraph_Distance1Color.hpp"
 #include "KokkosKernels_TestUtils.hpp"
+#include "KokkosSparse_IOUtils.hpp"
 
 void print_options(std::ostream &os, const char *app_name,
                    unsigned int indent = 0) {
@@ -376,16 +377,14 @@ void run_multi_mem_experiment(Parameters params) {
   if (params.a_mem_space == 1) {
     fast_crstmat_t a_fast_crsmat;
     a_fast_crsmat =
-        KokkosKernels::Impl::read_kokkos_crst_matrix<fast_crstmat_t>(
-            a_mat_file);
+        KokkosSparse::Impl::read_kokkos_crst_matrix<fast_crstmat_t>(a_mat_file);
     a_fast_crsgraph = a_fast_crsmat.graph;
     num_cols        = a_fast_crsmat.numCols();
 
   } else {
     slow_crstmat_t a_slow_crsmat;
     a_slow_crsmat =
-        KokkosKernels::Impl::read_kokkos_crst_matrix<slow_crstmat_t>(
-            a_mat_file);
+        KokkosSparse::Impl::read_kokkos_crst_matrix<slow_crstmat_t>(a_mat_file);
     a_slow_crsgraph = a_slow_crsmat.graph;
     num_cols        = a_slow_crsmat.numCols();
   }
@@ -537,7 +536,9 @@ int main(int argc, char **argv) {
       params.use_openmp;  // Assumption is that use_openmp variable is provided
                           // as number of threads
   const int device_id = std::max(params.use_cuda, params.use_hip) - 1;
-  Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id));
+  Kokkos::initialize(Kokkos::InitializationSettings()
+                         .set_num_threads(num_threads)
+                         .set_device_id(device_id));
   Kokkos::print_configuration(std::cout);
 
 #if defined(KOKKOS_ENABLE_OPENMP)
diff --git a/perf_test/graph/KokkosGraph_color_d2.cpp b/perf_test/graph/KokkosGraph_color_d2.cpp
index 7d6f45889a..b824ced38a 100644
--- a/perf_test/graph/KokkosGraph_color_d2.cpp
+++ b/perf_test/graph/KokkosGraph_color_d2.cpp
@@ -65,6 +65,7 @@
 #include <KokkosGraph_Distance2Color.hpp>
 #include "KokkosKernels_default_types.hpp"
 #include "KokkosKernels_TestUtils.hpp"
+#include "KokkosSparse_IOUtils.hpp"
 
 using namespace KokkosGraph;
 
@@ -595,7 +596,7 @@ void experiment_driver(const D2Parameters& params) {
   using graph_t  = typename crsMat_t::StaticCrsGraphType;
 
   crsMat_t A =
-      KokkosKernels::Impl::read_kokkos_crst_matrix<crsMat_t>(params.mtx_file);
+      KokkosSparse::Impl::read_kokkos_crst_matrix<crsMat_t>(params.mtx_file);
   graph_t Agraph = A.graph;
   int num_cols   = A.numCols();
 
@@ -631,7 +632,9 @@ int main(int argc, char* argv[]) {
     device_id = params.use_cuda - 1;
   else if (params.use_hip)
     device_id = params.use_hip - 1;
-  Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id));
+  Kokkos::initialize(Kokkos::InitializationSettings()
+                         .set_num_threads(num_threads)
+                         .set_device_id(device_id));
 
   // Print out verbose information about the configuration of the run.
   // Kokkos::print_configuration(std::cout);
diff --git a/perf_test/graph/KokkosGraph_mis_d2.cpp b/perf_test/graph/KokkosGraph_mis_d2.cpp
index c68d5f85e2..df5e28b315 100644
--- a/perf_test/graph/KokkosGraph_mis_d2.cpp
+++ b/perf_test/graph/KokkosGraph_mis_d2.cpp
@@ -66,6 +66,7 @@
 #include "KokkosGraph_MIS2.hpp"
 #include "KokkosKernels_default_types.hpp"
 #include "KokkosKernels_TestUtils.hpp"
+#include "KokkosSparse_IOUtils.hpp"
 
 using namespace KokkosGraph;
 
@@ -253,11 +254,11 @@ void run_mis2(const MIS2Parameters& params) {
 
   Kokkos::Timer t;
   crsMat_t A_in =
-      KokkosKernels::Impl::read_kokkos_crst_matrix<crsMat_t>(params.mtx_file);
+      KokkosSparse::Impl::read_kokkos_crst_matrix<crsMat_t>(params.mtx_file);
   std::cout << "I/O time: " << t.seconds() << " s\n";
   t.reset();
   // Symmetrize the matrix just in case
-  crsMat_t At_in = KokkosKernels::Impl::transpose_matrix(A_in);
+  crsMat_t At_in = KokkosSparse::Impl::transpose_matrix(A_in);
   crsMat_t A;
   KKH kkh;
   const default_scalar one = Kokkos::ArithTraits<default_scalar>::one();
diff --git a/perf_test/graph/KokkosGraph_run_triangle.hpp b/perf_test/graph/KokkosGraph_run_triangle.hpp
index 2fee139a64..0a189cd3e1 100644
--- a/perf_test/graph/KokkosGraph_run_triangle.hpp
+++ b/perf_test/graph/KokkosGraph_run_triangle.hpp
@@ -117,9 +117,7 @@ struct Flush {
   void init(value_type &update) { update = 0; }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type &update, const volatile value_type &input) {
-    update += input;
-  }
+  void join(value_type &update, const value_type &input) { update += input; }
 
   KOKKOS_INLINE_FUNCTION
   void operator()(const int i, value_type &update) const { update += _buf[i]; }
diff --git a/perf_test/graph/KokkosGraph_triangle.cpp b/perf_test/graph/KokkosGraph_triangle.cpp
index 17e4a08de4..be0b57492a 100644
--- a/perf_test/graph/KokkosGraph_triangle.cpp
+++ b/perf_test/graph/KokkosGraph_triangle.cpp
@@ -296,12 +296,14 @@ int main(int argc, char **argv) {
       params.use_openmp;  // Assumption is that use_openmp variable is provided
                           // as number of threads
   const int device_id = 0;
-  Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id));
+  Kokkos::initialize(Kokkos::InitializationSettings()
+                         .set_num_threads(num_threads)
+                         .set_device_id(device_id));
 
 #if defined(KOKKOS_ENABLE_OPENMP)
 
   if (params.use_openmp) {
-    Kokkos::OpenMP::print_configuration(std::cout);
+    Kokkos::OpenMP().print_configuration(std::cout);
 #ifdef KOKKOSKERNELS_MULTI_MEM
     KokkosKernels::Experiment::run_multi_mem_triangle<
         size_type, idx, Kokkos::OpenMP, Kokkos::OpenMP::memory_space,
@@ -317,7 +319,7 @@ int main(int argc, char **argv) {
 
 #if defined(KOKKOS_ENABLE_CUDA)
   if (params.use_cuda) {
-    Kokkos::Cuda::print_configuration(std::cout);
+    Kokkos::Cuda().print_configuration(std::cout);
 #ifdef KOKKOSKERNELS_MULTI_MEM
     KokkosKernels::Experiment::run_multi_mem_triangle<
         size_type, idx, Kokkos::Cuda, Kokkos::Cuda::memory_space,
@@ -333,7 +335,7 @@ int main(int argc, char **argv) {
 
 #if defined(KOKKOS_ENABLE_HIP)
   if (params.use_hip) {
-    Kokkos::Experimental::HIP::print_configuration(std::cout);
+    Kokkos::Experimental::HIP().print_configuration(std::cout);
     KokkosKernels::Experiment::run_multi_mem_triangle<
         size_type, idx, Kokkos::Experimental::HIP,
         Kokkos::Experimental::HIPSpace, Kokkos::Experimental::HIPSpace>(params);
diff --git a/perf_test/sparse/KokkosSparse_block_pcg.cpp b/perf_test/sparse/KokkosSparse_block_pcg.cpp
index 89ab0bfdca..3628eac956 100644
--- a/perf_test/sparse/KokkosSparse_block_pcg.cpp
+++ b/perf_test/sparse/KokkosSparse_block_pcg.cpp
@@ -50,7 +50,7 @@
 #include "KokkosSparse_pcg.hpp"
 
 #include "KokkosKernels_Utils.hpp"
-#include "KokkosKernels_IOUtils.hpp"
+#include "KokkosSparse_IOUtils.hpp"
 
 #include "KokkosKernels_TestUtils.hpp"
 
@@ -75,7 +75,7 @@ crsMat_t create_crs_matrix(char *mtx_bin_file) {
 
   if (std::string(mtx_bin_file) == "auto") {
     INDEX_TYPE num_rows = 11, num_cols = 11, nnz = 40;
-    crsmat = KokkosKernels::Impl::kk_generate_diagonally_dominant_sparse_matrix<
+    crsmat = KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix<
         crsMat_t>(num_rows, num_cols, nnz, 3, 5);
     printf("generating test matrix automatically\n");
     printf("   num rows:      %d", num_rows);
@@ -86,7 +86,7 @@ crsMat_t create_crs_matrix(char *mtx_bin_file) {
     INDEX_TYPE *xadj, *adj;
     SCALAR_TYPE *ew;
 
-    KokkosKernels::Impl::read_matrix<INDEX_TYPE, INDEX_TYPE, SCALAR_TYPE>(
+    KokkosSparse::Impl::read_matrix<INDEX_TYPE, INDEX_TYPE, SCALAR_TYPE>(
         &nv, &ne, &xadj, &adj, &ew, mtx_bin_file);
 
     row_map_view_t rowmap_view("rowmap_view", nv + 1);
@@ -322,7 +322,7 @@ void run_experiment(
   // typedef typename lno_nnz_view_t::value_type lno_t;
   // typedef typename lno_view_t::value_type size_type;
   // typedef typename scalar_view_t::value_type scalar_t;
-  KokkosKernels::Impl::kk_create_blockcrs_formated_point_crsmatrix(
+  KokkosSparse::Impl::kk_create_blockcrs_formatted_point_crsmatrix(
       block_size, crsmat.numRows(), crsmat.numCols(), crsmat.graph.row_map,
       crsmat.graph.entries, crsmat.values, out_r, out_c, pf_rm, pf_e, pf_v);
 
@@ -349,7 +349,7 @@ void run_experiment(
   scalar_view_t bf_v;
   size_t but_r, but_c;
 
-  KokkosKernels::Impl::kk_create_blockcrs_from_blockcrs_formatted_point_crs(
+  KokkosSparse::Impl::kk_create_blockcrs_from_blockcrs_formatted_point_crs(
       block_size, out_r, out_c, pf_rm, pf_e, pf_v, but_r, but_c, bf_rm, bf_e,
       bf_v);
 
@@ -381,7 +381,7 @@ int main(int argc, char **argv) {
   int cmdline[CMD_COUNT];
   char *mtx_bin_file = NULL;
   int block_size     = 5;
-  struct Kokkos::InitArguments kargs;
+  Kokkos::InitializationSettings kargs;
 
   for (int i = 0; i < CMD_COUNT; ++i) cmdline[i] = 0;
 
@@ -389,9 +389,11 @@ int main(int argc, char **argv) {
     if (0 == Test::string_compare_no_case(argv[i], "--serial")) {
       cmdline[CMD_USE_SERIAL] = 1;
     } else if (0 == Test::string_compare_no_case(argv[i], "--threads")) {
-      kargs.num_threads = cmdline[CMD_USE_THREADS] = atoi(argv[++i]);
+      cmdline[CMD_USE_THREADS] = atoi(argv[++i]);
+      kargs.set_num_threads(cmdline[CMD_USE_THREADS]);
     } else if (0 == Test::string_compare_no_case(argv[i], "--openmp")) {
-      kargs.num_threads = cmdline[CMD_USE_OPENMP] = atoi(argv[++i]);
+      cmdline[CMD_USE_OPENMP] = atoi(argv[++i]);
+      kargs.set_num_threads(cmdline[CMD_USE_OPENMP]);
     } else if (0 == Test::string_compare_no_case(argv[i], "--cuda")) {
       cmdline[CMD_USE_CUDA] = 1;
     } else if (0 == Test::string_compare_no_case(argv[i], "--mtx")) {
@@ -435,7 +437,7 @@ int main(int argc, char **argv) {
 
   if (cmdline[CMD_USE_SERIAL]) {
     using myExecSpace = Kokkos::Serial;
-    Kokkos::Serial::print_configuration(std::cout);
+    myExecSpace().print_configuration(std::cout);
 
     using crsMat_t =
         typename KokkosSparse::CrsMatrix<SCALAR_TYPE, INDEX_TYPE, myExecSpace,
@@ -458,7 +460,7 @@ int main(int argc, char **argv) {
 
   if (cmdline[CMD_USE_THREADS]) {
     using myExecSpace = Kokkos::Threads;
-    Kokkos::Threads::print_configuration(std::cout);
+    myExecSpace().print_configuration(std::cout);
 
     using crsMat_t =
         typename KokkosSparse::CrsMatrix<SCALAR_TYPE, INDEX_TYPE, myExecSpace,
@@ -481,7 +483,7 @@ int main(int argc, char **argv) {
 
   if (cmdline[CMD_USE_OPENMP]) {
     using myExecSpace = Kokkos::OpenMP;
-    Kokkos::OpenMP::print_configuration(std::cout);
+    myExecSpace().print_configuration(std::cout);
 
     using crsMat_t =
         typename KokkosSparse::CrsMatrix<SCALAR_TYPE, INDEX_TYPE, myExecSpace,
@@ -504,7 +506,7 @@ int main(int argc, char **argv) {
   if (cmdline[CMD_USE_CUDA]) {
     // Use the last device:
     using myExecSpace = Kokkos::Cuda;
-    Kokkos::Cuda::print_configuration(std::cout);
+    myExecSpace().print_configuration(std::cout);
 
     using crsMat_t =
         typename KokkosSparse::CrsMatrix<SCALAR_TYPE, INDEX_TYPE, myExecSpace,
diff --git a/perf_test/sparse/KokkosSparse_gs.cpp b/perf_test/sparse/KokkosSparse_gs.cpp
index 3d2be67676..2136cbb640 100644
--- a/perf_test/sparse/KokkosSparse_gs.cpp
+++ b/perf_test/sparse/KokkosSparse_gs.cpp
@@ -52,6 +52,7 @@
 #include <KokkosBlas1_nrm2.hpp>
 #include <KokkosKernels_config.h>
 #include "KokkosKernels_default_types.hpp"
+#include "KokkosSparse_IOUtils.hpp"
 #include <iostream>
 #include <random>
 #include <vector>
@@ -177,7 +178,7 @@ crsMat_t generateLongRowMatrix(const GS_Parameters& params) {
                                     rowmap.data(), numRows + 1));
   crsMat_t A("A", numRows, numRows, totalEntries, valuesView, rowmapView,
              entriesView);
-  A = KokkosKernels::sort_and_merge_matrix(A);
+  A = KokkosSparse::sort_and_merge_matrix(A);
   if (params.graph_symmetric) {
     // Symmetrize on host, rather than relying on the parallel versions (those
     // can be tested for symmetric=false)
@@ -203,7 +204,7 @@ void runGS(const GS_Parameters& params) {
   typedef typename crsMat_t::values_type::non_const_type scalar_view_t;
   crsMat_t A;
   if (params.matrix_path)
-    A = KokkosKernels::Impl::read_kokkos_crst_matrix<crsMat_t>(
+    A = KokkosSparse::Impl::read_kokkos_crst_matrix<crsMat_t>(
         params.matrix_path);
   else
     A = generateLongRowMatrix<crsMat_t>(params);
diff --git a/perf_test/sparse/KokkosSparse_kk_spmv.cpp b/perf_test/sparse/KokkosSparse_kk_spmv.cpp
index 953294b120..40887d67ec 100644
--- a/perf_test/sparse/KokkosSparse_kk_spmv.cpp
+++ b/perf_test/sparse/KokkosSparse_kk_spmv.cpp
@@ -55,6 +55,7 @@
 #include <Kokkos_Core.hpp>
 #include <KokkosSparse_CrsMatrix.hpp>
 #include <KokkosKernels_IOUtils.hpp>
+#include <KokkosSparse_IOUtils.hpp>
 #include <KokkosSparse_spmv.hpp>
 #include "KokkosKernels_default_types.hpp"
 
@@ -74,11 +75,11 @@ void run_spmv(Ordinal numRows, Ordinal numCols, const char* filename, int loop,
   srand(17312837);
   matrix_type A;
   if (filename)
-    A = KokkosKernels::Impl::read_kokkos_crst_matrix<matrix_type>(filename);
+    A = KokkosSparse::Impl::read_kokkos_crst_matrix<matrix_type>(filename);
   else {
     Offset nnz = 10 * numRows;
     // note: the help text says the bandwidth is fixed at 0.01 * numRows
-    A = KokkosKernels::Impl::kk_generate_sparse_matrix<matrix_type>(
+    A = KokkosSparse::Impl::kk_generate_sparse_matrix<matrix_type>(
         numRows, numCols, nnz, 0, 0.01 * numRows);
   }
   numRows = A.numRows();
diff --git a/perf_test/sparse/KokkosSparse_multimem_spgemm.hpp b/perf_test/sparse/KokkosSparse_multimem_spgemm.hpp
index 371f1b1d33..d7ae6da430 100644
--- a/perf_test/sparse/KokkosSparse_multimem_spgemm.hpp
+++ b/perf_test/sparse/KokkosSparse_multimem_spgemm.hpp
@@ -44,6 +44,7 @@
 
 #include "KokkosSparse_CrsMatrix.hpp"
 #include "KokkosSparse_run_spgemm.hpp"
+#include "KokkosSparse_IOUtils.hpp"
 
 namespace KokkosKernels {
 
@@ -74,12 +75,10 @@ void run_multi_mem_spgemm(Parameters params) {
 
   if (params.a_mem_space == 1) {
     a_fast_crsmat =
-        KokkosKernels::Impl::read_kokkos_crst_matrix<fast_crstmat_t>(
-            a_mat_file);
+        KokkosSparse::Impl::read_kokkos_crst_matrix<fast_crstmat_t>(a_mat_file);
   } else {
     a_slow_crsmat =
-        KokkosKernels::Impl::read_kokkos_crst_matrix<slow_crstmat_t>(
-            a_mat_file);
+        KokkosSparse::Impl::read_kokkos_crst_matrix<slow_crstmat_t>(a_mat_file);
   }
 
   if ((b_mat_file == NULL || strcmp(b_mat_file, a_mat_file) == 0) &&
@@ -90,13 +89,11 @@ void run_multi_mem_spgemm(Parameters params) {
   } else if (params.b_mem_space == 1) {
     if (b_mat_file == NULL) b_mat_file = a_mat_file;
     b_fast_crsmat =
-        KokkosKernels::Impl::read_kokkos_crst_matrix<fast_crstmat_t>(
-            b_mat_file);
+        KokkosSparse::Impl::read_kokkos_crst_matrix<fast_crstmat_t>(b_mat_file);
   } else {
     if (b_mat_file == NULL) b_mat_file = a_mat_file;
     b_slow_crsmat =
-        KokkosKernels::Impl::read_kokkos_crst_matrix<slow_crstmat_t>(
-            b_mat_file);
+        KokkosSparse::Impl::read_kokkos_crst_matrix<slow_crstmat_t>(b_mat_file);
   }
 
   if (params.a_mem_space == 1) {
@@ -222,18 +219,18 @@ void run_multi_mem_spgemm(Parameters params) {
 
   if (c_mat_file != NULL) {
     if (params.c_mem_space == 1) {
-      KokkosKernels::sort_crs_matrix(c_fast_crsmat);
+      KokkosSparse::sort_crs_matrix(c_fast_crsmat);
 
-      KokkosKernels::Impl::write_graph_bin(
+      KokkosSparse::Impl::write_graph_bin(
           (lno_t)(c_fast_crsmat.numRows()),
           (size_type)(c_fast_crsmat.graph.entries.extent(0)),
           c_fast_crsmat.graph.row_map.data(),
           c_fast_crsmat.graph.entries.data(), c_fast_crsmat.values.data(),
           c_mat_file);
     } else {
-      KokkosKernels::sort_crs_matrix(c_slow_crsmat);
+      KokkosSparse::sort_crs_matrix(c_slow_crsmat);
 
-      KokkosKernels::Impl::write_graph_bin(
+      KokkosSparse::Impl::write_graph_bin(
           (lno_t)c_slow_crsmat.numRows(),
           (size_type)c_slow_crsmat.graph.entries.extent(0),
           c_slow_crsmat.graph.row_map.data(),
diff --git a/perf_test/sparse/KokkosSparse_pcg.cpp b/perf_test/sparse/KokkosSparse_pcg.cpp
index 5f34ec1cd9..51c2cbb01b 100644
--- a/perf_test/sparse/KokkosSparse_pcg.cpp
+++ b/perf_test/sparse/KokkosSparse_pcg.cpp
@@ -49,6 +49,7 @@
 #include "KokkosKernels_IOUtils.hpp"
 #include "KokkosKernels_default_types.hpp"
 #include "KokkosKernels_TestUtils.hpp"
+#include "KokkosSparse_IOUtils.hpp"
 #include <iostream>
 
 #define MAXVAL 1
@@ -263,9 +264,8 @@ void run_pcg(int *cmdline, const char *mtx_file) {
   default_lno_t *xadj, *adj;
   default_scalar *ew;
 
-  KokkosKernels::Impl::read_matrix<default_lno_t, default_lno_t,
-                                   default_scalar>(&nv, &ne, &xadj, &adj, &ew,
-                                                   mtx_file);
+  KokkosSparse::Impl::read_matrix<default_lno_t, default_lno_t, default_scalar>(
+      &nv, &ne, &xadj, &adj, &ew, mtx_file);
 
   typedef
       typename KokkosSparse::CrsMatrix<default_scalar, default_lno_t,
@@ -370,17 +370,16 @@ int main(int argc, char **argv) {
     return 0;
   }
 
-  Kokkos::InitArguments init_args;  // Construct with default args, change
-                                    // members based on exec space
+  // Construct with default args, change members based on exec space
+  Kokkos::InitializationSettings init_args;
 
-  init_args.device_id = cmdline[CMD_DEVICE];
+  init_args.set_device_id(cmdline[CMD_DEVICE]);
+  init_args.set_num_threads(
+      std::max(cmdline[CMD_USE_THREADS], cmdline[CMD_USE_OPENMP]));
   if (cmdline[CMD_USE_NUMA] && cmdline[CMD_USE_CORE_PER_NUMA]) {
-    init_args.num_threads =
-        std::max(cmdline[CMD_USE_THREADS], cmdline[CMD_USE_OPENMP]);
-    init_args.num_numa = cmdline[CMD_USE_NUMA];
-  } else {
-    init_args.num_threads =
-        std::max(cmdline[CMD_USE_THREADS], cmdline[CMD_USE_OPENMP]);
+    KokkosKernels::Impl::throw_runtime_exception(
+        "NUMA init arg is no longer supported by Kokkos");
+    // init_args.num_numa = cmdline[CMD_USE_NUMA];
   }
 
   Kokkos::initialize(init_args);
diff --git a/perf_test/sparse/KokkosSparse_run_spgemm.hpp b/perf_test/sparse/KokkosSparse_run_spgemm.hpp
index caedb013c3..5ece07e403 100644
--- a/perf_test/sparse/KokkosSparse_run_spgemm.hpp
+++ b/perf_test/sparse/KokkosSparse_run_spgemm.hpp
@@ -44,7 +44,7 @@
 
 #include "KokkosSparse_spgemm.hpp"
 #include "KokkosKernels_TestParameters.hpp"
-#include "KokkosKernels_Sorting.hpp"
+#include "KokkosSparse_SortCrs.hpp"
 
 #define TRANPOSEFIRST false
 #define TRANPOSESECOND false
@@ -67,7 +67,7 @@ bool is_same_matrix(crsMat_t output_mat1, crsMat_t output_mat2) {
   size_t nentries2 = output_mat2.graph.entries.extent(0);
   size_t nvals2    = output_mat2.values.extent(0);
 
-  KokkosKernels::sort_crs_matrix(output_mat1);
+  KokkosSparse::sort_crs_matrix(output_mat1);
 
   if (nrows1 != nrows2) {
     std::cerr << "row count is different" << std::endl;
@@ -82,7 +82,7 @@ bool is_same_matrix(crsMat_t output_mat1, crsMat_t output_mat2) {
     return false;
   }
 
-  KokkosKernels::sort_crs_matrix(output_mat2);
+  KokkosSparse::sort_crs_matrix(output_mat2);
 
   bool is_identical = true;
   is_identical      = KokkosKernels::Impl::kk_is_identical_view<
diff --git a/perf_test/sparse/KokkosSparse_run_spgemm_jacobi.hpp b/perf_test/sparse/KokkosSparse_run_spgemm_jacobi.hpp
index b5ac32a86e..8efd849f58 100644
--- a/perf_test/sparse/KokkosSparse_run_spgemm_jacobi.hpp
+++ b/perf_test/sparse/KokkosSparse_run_spgemm_jacobi.hpp
@@ -45,7 +45,8 @@
 #include "KokkosSparse_CrsMatrix.hpp"
 #include "KokkosKernels_TestParameters.hpp"
 #include "KokkosSparse_spgemm.hpp"
-#include "KokkosKernels_Sorting.hpp"
+#include "KokkosSparse_SortCrs.hpp"
+#include "KokkosSparse_IOUtils.hpp"
 
 #define TRANSPOSEFIRST false
 #define TRANSPOSESECOND false
@@ -69,7 +70,7 @@ bool is_same_matrix(crsMat_t output_mat1, crsMat_t output_mat2) {
   size_t nentries2 = output_mat2.graph.entries.extent(0);
   size_t nvals2    = output_mat2.values.extent(0);
 
-  KokkosKernels::sort_crs_matrix(output_mat1);
+  KokkosSparse::sort_crs_matrix(output_mat1);
 
   if (nrows1 != nrows2) {
     std::cerr << "row count is different" << std::endl;
@@ -84,7 +85,7 @@ bool is_same_matrix(crsMat_t output_mat1, crsMat_t output_mat2) {
     return false;
   }
 
-  KokkosKernels::sort_crs_matrix(output_mat2);
+  KokkosSparse::sort_crs_matrix(output_mat2);
 
   bool is_identical = true;
   is_identical      = KokkosKernels::Impl::kk_is_identical_view<
@@ -337,12 +338,10 @@ void run_spgemm_jacobi(Parameters params) {
 
   if (params.a_mem_space == 1) {
     a_fast_crsmat =
-        KokkosKernels::Impl::read_kokkos_crst_matrix<fast_crstmat_t>(
-            a_mat_file);
+        KokkosSparse::Impl::read_kokkos_crst_matrix<fast_crstmat_t>(a_mat_file);
   } else {
     a_slow_crsmat =
-        KokkosKernels::Impl::read_kokkos_crst_matrix<slow_crstmat_t>(
-            a_mat_file);
+        KokkosSparse::Impl::read_kokkos_crst_matrix<slow_crstmat_t>(a_mat_file);
   }
 
   if ((b_mat_file == NULL || strcmp(b_mat_file, a_mat_file) == 0) &&
@@ -353,13 +352,11 @@ void run_spgemm_jacobi(Parameters params) {
   } else if (params.b_mem_space == 1) {
     if (b_mat_file == NULL) b_mat_file = a_mat_file;
     b_fast_crsmat =
-        KokkosKernels::Impl::read_kokkos_crst_matrix<fast_crstmat_t>(
-            b_mat_file);
+        KokkosSparse::Impl::read_kokkos_crst_matrix<fast_crstmat_t>(b_mat_file);
   } else {
     if (b_mat_file == NULL) b_mat_file = a_mat_file;
     b_slow_crsmat =
-        KokkosKernels::Impl::read_kokkos_crst_matrix<slow_crstmat_t>(
-            b_mat_file);
+        KokkosSparse::Impl::read_kokkos_crst_matrix<slow_crstmat_t>(b_mat_file);
   }
 
   if (params.a_mem_space == 1) {
@@ -485,18 +482,18 @@ void run_spgemm_jacobi(Parameters params) {
 
   if (c_mat_file != NULL) {
     if (params.c_mem_space == 1) {
-      KokkosKernels::sort_crs_matrix(c_fast_crsmat);
+      KokkosSparse::sort_crs_matrix(c_fast_crsmat);
 
-      KokkosKernels::Impl::write_graph_bin(
+      KokkosSparse::Impl::write_graph_bin(
           (lno_t)(c_fast_crsmat.numRows()),
           (size_type)(c_fast_crsmat.graph.entries.extent(0)),
           c_fast_crsmat.graph.row_map.data(),
           c_fast_crsmat.graph.entries.data(), c_fast_crsmat.values.data(),
           c_mat_file);
     } else {
-      KokkosKernels::sort_crs_matrix(c_slow_crsmat);
+      KokkosSparse::sort_crs_matrix(c_slow_crsmat);
 
-      KokkosKernels::Impl::write_graph_bin(
+      KokkosSparse::Impl::write_graph_bin(
           (lno_t)c_slow_crsmat.numRows(),
           (size_type)c_slow_crsmat.graph.entries.extent(0),
           c_slow_crsmat.graph.row_map.data(),
diff --git a/perf_test/sparse/KokkosSparse_spadd.cpp b/perf_test/sparse/KokkosSparse_spadd.cpp
index 7b0bd42d2a..5448843168 100644
--- a/perf_test/sparse/KokkosSparse_spadd.cpp
+++ b/perf_test/sparse/KokkosSparse_spadd.cpp
@@ -45,8 +45,9 @@
 #include <iostream>
 #include "KokkosKernels_config.h"
 #include "KokkosKernels_Handle.hpp"
-#include "KokkosKernels_IOUtils.hpp"
-#include "KokkosKernels_SparseUtils_cusparse.hpp"
+#include "KokkosSparse_IOUtils.hpp"
+#include "KokkosSparse_Utils_cusparse.hpp"
+#include "KokkosSparse_Utils_mkl.hpp"
 #include "KokkosSparse_spadd.hpp"
 #include "KokkosKernels_TestUtils.hpp"
 
@@ -57,21 +58,6 @@
 #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL
 #include <mkl.h>
 #include <mkl_spblas.h>
-
-inline void spadd_mkl_internal_safe_call(sparse_status_t mklStatus,
-                                         const char* name,
-                                         const char* file = nullptr,
-                                         const int line   = 0) {
-  if (SPARSE_STATUS_SUCCESS != mklStatus) {
-    std::ostringstream oss;
-    oss << "MKL call \"" << name << "\" encountered error at " << file << ":"
-        << line << '\n';
-    Kokkos::abort(oss.str().c_str());
-  }
-}
-
-#define SPADD_MKL_SAFE_CALL(call) \
-  spadd_mkl_internal_safe_call(call, #call, __FILE__, __LINE__)
 #endif
 
 #if defined(KOKKOSKERNELS_INST_DOUBLE) &&     \
@@ -125,19 +111,19 @@ void run_experiment(const Params& params) {
   lno_t n = params.n;
   if (params.amtx.length()) {
     std::cout << "Loading A from " << params.amtx << '\n';
-    A = KokkosKernels::Impl::read_kokkos_crst_matrix<crsMat_t>(
+    A = KokkosSparse::Impl::read_kokkos_crst_matrix<crsMat_t>(
         params.amtx.c_str());
     m = A.numRows();
     n = A.numCols();
   } else {
     std::cout << "Randomly generating A\n";
     size_type nnzUnused = m * params.nnzPerRow;
-    A = KokkosKernels::Impl::kk_generate_sparse_matrix<crsMat_t>(
-        m, n, nnzUnused, 0, (n + 3) / 3);
+    A = KokkosSparse::Impl::kk_generate_sparse_matrix<crsMat_t>(m, n, nnzUnused,
+                                                                0, (n + 3) / 3);
   }
   if (params.bmtx.length()) {
     std::cout << "Loading B from " << params.bmtx << '\n';
-    B = KokkosKernels::Impl::read_kokkos_crst_matrix<crsMat_t>(
+    B = KokkosSparse::Impl::read_kokkos_crst_matrix<crsMat_t>(
         params.bmtx.c_str());
   } else if (params.bDiag) {
     std::cout << "Generating B as diagonal matrix.\n";
@@ -168,8 +154,8 @@ void run_experiment(const Params& params) {
   } else {
     std::cout << "Randomly generating B\n";
     size_type nnzUnused = m * params.nnzPerRow;
-    B = KokkosKernels::Impl::kk_generate_sparse_matrix<crsMat_t>(
-        m, n, nnzUnused, 0, (n + 3) / 3);
+    B = KokkosSparse::Impl::kk_generate_sparse_matrix<crsMat_t>(m, n, nnzUnused,
+                                                                0, (n + 3) / 3);
   }
   // Make sure dimensions are compatible
   if (A.numRows() != B.numRows() || A.numCols() != B.numCols()) {
@@ -200,8 +186,8 @@ void run_experiment(const Params& params) {
   if (params.sorted) {
     std::cout << "Assuming input matrices are sorted (explicitly sorting just "
                  "in case)\n";
-    KokkosKernels::sort_crs_matrix(A);
-    KokkosKernels::sort_crs_matrix(B);
+    KokkosSparse::sort_crs_matrix(A);
+    KokkosSparse::sort_crs_matrix(B);
   } else
     std::cout << "Assuming input matrices are not sorted.\n";
   kh.create_spadd_handle(params.sorted);
@@ -259,11 +245,11 @@ void run_experiment(const Params& params) {
 #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL
   sparse_matrix_t Amkl, Bmkl, Cmkl;
   if (params.use_mkl) {
-    SPADD_MKL_SAFE_CALL(mkl_sparse_d_create_csr(
+    KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_csr(
         &Amkl, SPARSE_INDEX_BASE_ZERO, m, n, (int*)A.graph.row_map.data(),
         (int*)A.graph.row_map.data() + 1, A.graph.entries.data(),
         A.values.data()));
-    SPADD_MKL_SAFE_CALL(mkl_sparse_d_create_csr(
+    KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_csr(
         &Bmkl, SPARSE_INDEX_BASE_ZERO, m, n, (int*)B.graph.row_map.data(),
         (int*)B.graph.row_map.data() + 1, B.graph.entries.data(),
         B.values.data()));
@@ -326,9 +312,9 @@ void run_experiment(const Params& params) {
 #endif
       } else if (params.use_mkl) {
 #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL
-        SPADD_MKL_SAFE_CALL(mkl_sparse_d_add(SPARSE_OPERATION_NON_TRANSPOSE,
-                                             Amkl, 1.0, Bmkl, &Cmkl));
-        SPADD_MKL_SAFE_CALL(mkl_sparse_destroy(Cmkl));
+        KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_add(
+            SPARSE_OPERATION_NON_TRANSPOSE, Amkl, 1.0, Bmkl, &Cmkl));
+        KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_destroy(Cmkl));
 #endif
       } else {
         spadd_numeric(
@@ -351,8 +337,8 @@ void run_experiment(const Params& params) {
 
 #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL
   if (params.use_mkl) {
-    SPADD_MKL_SAFE_CALL(mkl_sparse_destroy(Amkl));
-    SPADD_MKL_SAFE_CALL(mkl_sparse_destroy(Bmkl));
+    KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_destroy(Amkl));
+    KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_destroy(Bmkl));
   }
 #endif
 
@@ -377,8 +363,8 @@ void run_experiment(const Params& params) {
     std::cout << "Writing C (" << m << "x" << n << ") to " << params.cmtx
               << "\n";
     crsMat_t C("C", m, n, c_nnz, valuesC, row_mapC, entriesC);
-    KokkosKernels::Impl::write_kokkos_crst_matrix<crsMat_t>(
-        C, params.cmtx.c_str());
+    KokkosSparse::Impl::write_kokkos_crst_matrix<crsMat_t>(C,
+                                                           params.cmtx.c_str());
   }
 }
 
@@ -490,7 +476,9 @@ int main(int argc, char** argv) {
                           // as number of threads
   const int device_id = params.use_cuda - 1;
 
-  Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id));
+  Kokkos::initialize(Kokkos::InitializationSettings()
+                         .set_num_threads(num_threads)
+                         .set_device_id(device_id));
   // Kokkos::print_configuration(std::cout);
 
   // First, make sure that requested TPL (if any) is actually available
diff --git a/perf_test/sparse/KokkosSparse_spgemm.cpp b/perf_test/sparse/KokkosSparse_spgemm.cpp
index 9fada4caaa..da705fcdf2 100644
--- a/perf_test/sparse/KokkosSparse_spgemm.cpp
+++ b/perf_test/sparse/KokkosSparse_spgemm.cpp
@@ -294,7 +294,9 @@ int main(int argc, char** argv) {
   const int device_id =
       params.use_cuda ? params.use_cuda - 1 : params.use_hip - 1;
 
-  Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id));
+  Kokkos::initialize(Kokkos::InitializationSettings()
+                         .set_num_threads(num_threads)
+                         .set_device_id(device_id));
   Kokkos::print_configuration(std::cout);
 
 #if defined(KOKKOS_ENABLE_OPENMP)
diff --git a/perf_test/sparse/KokkosSparse_spgemm_jacobi.cpp b/perf_test/sparse/KokkosSparse_spgemm_jacobi.cpp
index 98942acb27..aa3969e6c8 100644
--- a/perf_test/sparse/KokkosSparse_spgemm_jacobi.cpp
+++ b/perf_test/sparse/KokkosSparse_spgemm_jacobi.cpp
@@ -259,7 +259,9 @@ int main(int argc, char** argv) {
   const int num_threads = std::max(params.use_openmp, params.use_threads);
   const int device_id   = params.use_cuda - 1;
 
-  Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id));
+  Kokkos::initialize(Kokkos::InitializationSettings()
+                         .set_num_threads(num_threads)
+                         .set_device_id(device_id));
   Kokkos::print_configuration(std::cout);
 
 #if defined(KOKKOS_ENABLE_OPENMP)
diff --git a/perf_test/sparse/KokkosSparse_spiluk.cpp b/perf_test/sparse/KokkosSparse_spiluk.cpp
index d381b9b888..b86ecc352f 100644
--- a/perf_test/sparse/KokkosSparse_spiluk.cpp
+++ b/perf_test/sparse/KokkosSparse_spiluk.cpp
@@ -58,13 +58,14 @@
 
 #include <Kokkos_Core.hpp>
 
-#include "KokkosKernels_SparseUtils.hpp"
+#include "KokkosSparse_Utils.hpp"
 #include "KokkosSparse_spiluk.hpp"
 #include "KokkosSparse_spmv.hpp"
 #include "KokkosBlas1_nrm2.hpp"
 #include "KokkosSparse_CrsMatrix.hpp"
 #include "KokkosKernels_default_types.hpp"
 #include <KokkosKernels_IOUtils.hpp>
+#include <KokkosSparse_IOUtils.hpp>
 
 #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) && \
     (!defined(KOKKOS_ENABLE_CUDA) || (8000 <= CUDA_VERSION))
@@ -111,7 +112,7 @@ int test_spiluk_perf(std::vector<int> tests, std::string afilename, int kin,
   if (!afilename.empty()) {
     std::cout << "ILU(K) Begin: Read matrix filename " << afilename
               << std::endl;
-    crsmat_t A = KokkosKernels::Impl::read_kokkos_crst_matrix<crsmat_t>(
+    crsmat_t A = KokkosSparse::Impl::read_kokkos_crst_matrix<crsmat_t>(
         afilename.c_str());           // in_matrix
     graph_t graph         = A.graph;  // in_graph
     const size_type nrows = graph.numRows();
@@ -257,6 +258,10 @@ int test_spiluk_perf(std::vector<int> tests, std::string afilename, int kin,
                 << std::endl;
 
 #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
+      // cuSPARSE requires lno_t = size_type = int. For both, int is always used
+      // (if enabled)
+#if defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \
+    defined(KOKKOSKERNELS_INST_OFFSET_INT)
       if (fill_lev == 0) {
         std::cout << "CUSPARSE: No KK interface added yet" << std::endl;
 
@@ -412,6 +417,7 @@ int test_spiluk_perf(std::vector<int> tests, std::string afilename, int kin,
         }  // end row
         std::cout << "ILU(0) SUCCESS!" << std::endl;
       }  // fill_lev=0
+#endif
 #endif
 
       // Benchmark
@@ -436,6 +442,10 @@ int test_spiluk_perf(std::vector<int> tests, std::string afilename, int kin,
       std::cout << "LOOP_MIN_TIME:  " << min_time << std::endl;
 
 #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
+      // cuSPARSE requires lno_t = size_type = int. For both, int is always used
+      // (if enabled)
+#if defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \
+    defined(KOKKOSKERNELS_INST_OFFSET_INT)
       if (fill_lev == 0) {
         lno_view_t A_row_map("A_row_map", nrows + 1);
         lno_nnz_view_t A_entries("A_entries", nnz);
@@ -465,15 +475,21 @@ int test_spiluk_perf(std::vector<int> tests, std::string afilename, int kin,
         std::cout << "LOOP_MAX_TIME (cuSPARSE):  " << max_time << std::endl;
         std::cout << "LOOP_MIN_TIME (cuSPARSE):  " << min_time << std::endl;
       }  // fill_lev=0
+#endif
 #endif
     }  // end tests
 
 #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
+    // cuSPARSE requires lno_t = size_type = int. For both, int is always used
+    // (if enabled)
+#if defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \
+    defined(KOKKOSKERNELS_INST_OFFSET_INT)
     // step 6: free resources
     cudaFree(pBuffer);
     cusparseDestroyCsrilu02Info(info);
     cusparseDestroyMatDescr(descr);
     cusparseDestroy(handle);
+#endif
 #endif
   }  // end if (!afilename.empty())
 
diff --git a/perf_test/sparse/KokkosSparse_spmv.cpp b/perf_test/sparse/KokkosSparse_spmv.cpp
index 6b67905adc..9eec6181a7 100644
--- a/perf_test/sparse/KokkosSparse_spmv.cpp
+++ b/perf_test/sparse/KokkosSparse_spmv.cpp
@@ -55,6 +55,7 @@
 #include <Kokkos_Core.hpp>
 #include <KokkosSparse_CrsMatrix.hpp>
 #include <KokkosKernels_IOUtils.hpp>
+#include <KokkosSparse_IOUtils.hpp>
 #include <KokkosSparse_spmv.hpp>
 #include "KokkosKernels_default_types.hpp"
 #include <spmv/KokkosKernels_spmv_data.hpp>
@@ -90,12 +91,12 @@ int test_crs_matrix_singlevec(Ordinal numRows, Ordinal numCols, int test,
   srand(17312837);
   matrix_type A;
   if (filename)
-    A = KokkosKernels::Impl::read_kokkos_crst_matrix<matrix_type>(filename);
+    A = KokkosSparse::Impl::read_kokkos_crst_matrix<matrix_type>(filename);
   else {
     Offset nnz = 10 * numRows;
     // note: the help text says the bandwidth is fixed at 0.01 * numRows
     // CAVEAT:  small problem sizes are problematic, b/c of 0.01*numRows
-    A = KokkosKernels::Impl::kk_generate_sparse_matrix<matrix_type>(
+    A = KokkosSparse::Impl::kk_generate_sparse_matrix<matrix_type>(
         numRows, numCols, nnz, 0, 0.01 * numRows);
   }
   SPMVTestData test_data = setup_test(&data, A, rows_per_thread, team_size,
diff --git a/perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp b/perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp
index ca16f2067e..c578c269f8 100644
--- a/perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp
+++ b/perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp
@@ -603,9 +603,13 @@ int main(int argc, char** argv) {
           &vecY, y1.extent_int(0), (void*)y1.data(), myCudaDataType));
 
       const double alpha = 1.0, beta = 1.0;
-      size_t bufferSize     = 0;
-      void* dBuffer         = NULL;
+      size_t bufferSize = 0;
+      void* dBuffer     = NULL;
+#if CUSPARSE_VERSION >= 11201
+      cusparseSpMVAlg_t alg = CUSPARSE_SPMV_ALG_DEFAULT;
+#else
       cusparseSpMVAlg_t alg = CUSPARSE_MV_ALG_DEFAULT;
+#endif
       KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpMV_bufferSize(
           controls.getCusparseHandle(), CUSPARSE_OPERATION_NON_TRANSPOSE,
           &alpha, A_cusparse, vecX, &beta, vecY, myCudaDataType, alg,
diff --git a/perf_test/sparse/KokkosSparse_sptrsv.cpp b/perf_test/sparse/KokkosSparse_sptrsv.cpp
index c6787242d9..a27ed3f6d2 100644
--- a/perf_test/sparse/KokkosSparse_sptrsv.cpp
+++ b/perf_test/sparse/KokkosSparse_sptrsv.cpp
@@ -58,12 +58,13 @@
 
 #include <Kokkos_Core.hpp>
 
-#include "KokkosKernels_SparseUtils.hpp"
+#include "KokkosSparse_Utils.hpp"
 #include "KokkosSparse_sptrsv.hpp"
 #include "KokkosSparse_spmv.hpp"
 #include "KokkosSparse_CrsMatrix.hpp"
 #include "KokkosKernels_default_types.hpp"
 #include <KokkosKernels_IOUtils.hpp>
+#include "KokkosSparse_IOUtils.hpp"
 
 //#define INTERNAL_CUSPARSE
 
@@ -159,7 +160,7 @@ int test_sptrsv_perf(std::vector<int> tests, const std::string &lfilename,
   if (!lfilename.empty()) {
     std::cout << "Lower Tri Begin: Read matrix filename " << lfilename
               << std::endl;
-    crsmat_t triMtx = KokkosKernels::Impl::read_kokkos_crst_matrix<crsmat_t>(
+    crsmat_t triMtx = KokkosSparse::Impl::read_kokkos_crst_matrix<crsmat_t>(
         lfilename.c_str());                // in_matrix
     graph_t graph         = triMtx.graph;  // in_graph
     const size_type nrows = graph.numRows();
@@ -567,7 +568,7 @@ int test_sptrsv_perf(std::vector<int> tests, const std::string &lfilename,
   if (!ufilename.empty()) {
     std::cout << "Upper Tri Begin: Read matrix filename " << ufilename
               << std::endl;
-    crsmat_t triMtx = KokkosKernels::Impl::read_kokkos_crst_matrix<crsmat_t>(
+    crsmat_t triMtx = KokkosSparse::Impl::read_kokkos_crst_matrix<crsmat_t>(
         ufilename.c_str());                // in_matrix
     graph_t graph         = triMtx.graph;  // in_graph
     const size_type nrows = graph.numRows();
diff --git a/perf_test/sparse/KokkosSparse_sptrsv_supernode.cpp b/perf_test/sparse/KokkosSparse_sptrsv_supernode.cpp
index 039c88e9c1..b77f0b1d07 100644
--- a/perf_test/sparse/KokkosSparse_sptrsv_supernode.cpp
+++ b/perf_test/sparse/KokkosSparse_sptrsv_supernode.cpp
@@ -43,9 +43,10 @@
 */
 
 #include "Kokkos_Random.hpp"
-#include "KokkosKernels_SparseUtils.hpp"
+#include "KokkosSparse_Utils.hpp"
 #include "KokkosSparse_spmv.hpp"
 #include "KokkosSparse_CrsMatrix.hpp"
+#include "KokkosSparse_IOUtils.hpp"
 
 #include "KokkosSparse_sptrsv.hpp"
 #include "KokkosSparse_sptrsv_supernode.hpp"
@@ -58,12 +59,8 @@
 
 #include "KokkosSparse_sptrsv_aux.hpp"
 
-using namespace KokkosKernels;
-using namespace KokkosKernels::Impl;
-using namespace KokkosKernels::Experimental;
-using namespace KokkosSparse;
-using namespace KokkosSparse::Experimental;
-using namespace KokkosSparse::PerfTest::Experimental;
+namespace KSExp = KokkosSparse::Experimental;
+namespace KSPTE = KokkosSparse::PerfTest::Experimental;
 
 enum {
   CUSPARSE,
@@ -130,7 +127,7 @@ int test_sptrsv_perf(std::vector<int> tests, bool verbose,
     std::cout << " > Read a triangular-matrix filename " << matrix_filename
               << std::endl;
     host_crsmat_t M =
-        KokkosKernels::Impl::read_kokkos_crst_matrix<host_crsmat_t>(
+        KokkosSparse::Impl::read_kokkos_crst_matrix<host_crsmat_t>(
             matrix_filename.c_str());
     const size_type nrows = M.graph.numRows();
     // transpose the matrix to be stored in CCS
@@ -153,10 +150,10 @@ int test_sptrsv_perf(std::vector<int> tests, bool verbose,
     cols_view_t entries("colmap_view", nnzL);
     values_view_t values("values_view", nnzL);
     // transpose L
-    transpose_matrix<in_row_map_view_t, in_cols_view_t, in_values_view_t,
-                     row_map_view_t, cols_view_t, values_view_t, row_map_view_t,
-                     host_execution_space>(nrows, nrows, row_mapM, entriesM,
-                                           valuesM, row_map, entries, values);
+    KokkosSparse::Impl::transpose_matrix<
+        in_row_map_view_t, in_cols_view_t, in_values_view_t, row_map_view_t,
+        cols_view_t, values_view_t, row_map_view_t, host_execution_space>(
+        nrows, nrows, row_mapM, entriesM, valuesM, row_map, entries, values);
 
     // store L in CSC
     host_graph_t static_graph(entries, row_map);
@@ -211,24 +208,24 @@ int test_sptrsv_perf(std::vector<int> tests, bool verbose,
           if (test == SUPERNODAL_NAIVE) {
             std::cout << " > create handle for SUPERNODAL_NAIVE" << std::endl
                       << std::endl;
-            khL.create_sptrsv_handle(SPTRSVAlgorithm::SUPERNODAL_NAIVE, nrows,
-                                     true);
-            khU.create_sptrsv_handle(SPTRSVAlgorithm::SUPERNODAL_NAIVE, nrows,
-                                     true);
+            khL.create_sptrsv_handle(KSExp::SPTRSVAlgorithm::SUPERNODAL_NAIVE,
+                                     nrows, true);
+            khU.create_sptrsv_handle(KSExp::SPTRSVAlgorithm::SUPERNODAL_NAIVE,
+                                     nrows, true);
           } else if (test == SUPERNODAL_DAG) {
             std::cout << " > create handle for SUPERNODAL_DAG" << std::endl
                       << std::endl;
-            khL.create_sptrsv_handle(SPTRSVAlgorithm::SUPERNODAL_DAG, nrows,
-                                     true);
-            khU.create_sptrsv_handle(SPTRSVAlgorithm::SUPERNODAL_DAG, nrows,
-                                     true);
+            khL.create_sptrsv_handle(KSExp::SPTRSVAlgorithm::SUPERNODAL_DAG,
+                                     nrows, true);
+            khU.create_sptrsv_handle(KSExp::SPTRSVAlgorithm::SUPERNODAL_DAG,
+                                     nrows, true);
           } else if (test == SUPERNODAL_SPMV_DAG) {
             std::cout << " > create handle for SUPERNODAL_SPMV_DAG" << std::endl
                       << std::endl;
-            khL.create_sptrsv_handle(SPTRSVAlgorithm::SUPERNODAL_SPMV_DAG,
-                                     nrows, true);
-            khU.create_sptrsv_handle(SPTRSVAlgorithm::SUPERNODAL_SPMV_DAG,
-                                     nrows, true);
+            khL.create_sptrsv_handle(
+                KSExp::SPTRSVAlgorithm::SUPERNODAL_SPMV_DAG, nrows, true);
+            khU.create_sptrsv_handle(
+                KSExp::SPTRSVAlgorithm::SUPERNODAL_SPMV_DAG, nrows, true);
           }
           // verbose (optional, default is false)
           khL.set_sptrsv_verbose(verbose);
@@ -253,13 +250,13 @@ int test_sptrsv_perf(std::vector<int> tests, bool verbose,
           // graph/dag)
           khU.get_sptrsv_handle()->set_column_major(
               !khL.get_sptrsv_handle()->is_column_major());
-          sptrsv_supernodal_symbolic(nsuper, supercols.data(), etree, L.graph,
-                                     &khL, L.graph, &khU);
+          KSExp::sptrsv_supernodal_symbolic(nsuper, supercols.data(), etree,
+                                            L.graph, &khL, L.graph, &khU);
 
           // ==============================================
           // do numeric compute (copy numerical values from SuperLU data
           // structure to our sptrsv data structure)
-          sptrsv_compute(&khL, L);
+          KSExp::sptrsv_compute(&khL, L);
 
           // ==============================================
           // Preaparing for the first solve
@@ -283,7 +280,7 @@ int test_sptrsv_perf(std::vector<int> tests, bool verbose,
           // ==============================================
           // do L solve
           timer.reset();
-          sptrsv_solve(&khL, sol, rhs);
+          KSExp::sptrsv_solve(&khL, sol, rhs);
           Kokkos::fence();
           std::cout << " > Lower-TRI: " << std::endl;
           std::cout << "   Solve Time   : " << timer.seconds() << std::endl;
@@ -295,7 +292,7 @@ int test_sptrsv_perf(std::vector<int> tests, bool verbose,
           // Error Check ** on host **
           Kokkos::fence();
           std::cout << std::endl;
-          if (!check_errors(tol, A, rhs_host, sol_host)) {
+          if (!KSPTE::check_errors(tol, A, rhs_host, sol_host)) {
             num_failed++;
           }
 
@@ -307,7 +304,7 @@ int test_sptrsv_perf(std::vector<int> tests, bool verbose,
           Kokkos::fence();
           for (int i = 0; i < loop; i++) {
             timer.reset();
-            sptrsv_solve(&khL, sol, rhs);
+            KSExp::sptrsv_solve(&khL, sol, rhs);
             Kokkos::fence();
             double time = timer.seconds();
             ave_time += time;
diff --git a/perf_test/sparse/spmv/Kokkos_SPMV_Inspector.hpp b/perf_test/sparse/spmv/Kokkos_SPMV_Inspector.hpp
index c32968c177..3a631fc743 100644
--- a/perf_test/sparse/spmv/Kokkos_SPMV_Inspector.hpp
+++ b/perf_test/sparse/spmv/Kokkos_SPMV_Inspector.hpp
@@ -143,7 +143,7 @@ void kk_inspector_matvec(AType A, XType x, YType y, int team_size,
     workset_offsets(0)    = 0;
     lno_t ws              = 1;
     for (lno_t row = 0; row < A.numRows(); row++) {
-      if (A.graph.row_map(row) > ws * nnz_per_workset) {
+      if (A.graph.row_map(row) > size_type(ws) * nnz_per_workset) {
         workset_offsets(ws) = row;
         ws++;
       }
diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia
index c049e6b721..db7289619d 100755
--- a/scripts/cm_test_all_sandia
+++ b/scripts/cm_test_all_sandia
@@ -20,6 +20,7 @@ print_help() {
   echo "--spack: Run spack builds rather than direct CMake tests"
   echo ""
   echo "--debug: Run tests in debug. Defaults to False"
+  echo "--deprecated-code: Enable deprecated code (disabled by default)"
   echo "--boundscheck: Enable Kokkos_ENABLE_DEBUG_BOUNDS_CHECK to check View accesses within bounds."
   echo "--test-script: Test this script, not Kokkos"
   echo "--skip-hwloc: Do not do hwloc tests"
@@ -50,8 +51,8 @@ print_help() {
   echo "--build-list=BUILD,BUILD,BUILD..."
   echo "    Provide a comma-separated list of builds instead of running all builds"
   echo "    Valid items:"
-  echo "      OpenMP, Pthread, Serial, OpenMP_Serial, Pthread_Serial"
-  echo "      Cuda_OpenMP, Cuda_Pthread, Cuda_Serial"
+  echo "      OpenMP, Threads, Serial, OpenMP_Serial, Threads_Serial"
+  echo "      Cuda_OpenMP, Cuda_Threads, Cuda_Serial"
   echo ""
   echo "--with-scalars=SCALARS: set KOKKOSKERNELS_SCALARS"
   echo "    Provide a comma-separated list scalar types"
@@ -183,12 +184,12 @@ fi
 
 echo "Running on machine: $MACHINE"
 
-GCC_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial"
+GCC_BUILD_LIST="OpenMP,Threads,Serial,OpenMP_Serial,Threads_Serial"
 IBM_BUILD_LIST="OpenMP,Serial,OpenMP_Serial"
 ARM_GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial"
-INTEL_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial"
-CLANG_BUILD_LIST="Pthread,Serial,Pthread_Serial"
-CUDA_BUILD_LIST="Cuda_OpenMP,Cuda_Pthread,Cuda_Serial"
+INTEL_BUILD_LIST="OpenMP,Threads,Serial,OpenMP_Serial,Threads_Serial"
+CLANG_BUILD_LIST="Threads,Serial,Threads_Serial"
+CUDA_BUILD_LIST="Cuda_OpenMP,Cuda_Threads,Cuda_Serial"
 CUDA_IBM_BUILD_LIST="Cuda_OpenMP,Cuda_Serial"
 
 GCC_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wignored-qualifiers,-Wempty-body,-Wclobbered,-Wuninitialized"
@@ -266,6 +267,9 @@ KOKKOSKERNELS_OFFSETS="int,size_t"
 KOKKOSKERNELS_LAYOUTS="LayoutLeft"
 
 CTESTTIMEOUT=2500
+
+KOKKOS_DEPRECATED_CODE=""
+
 #
 # Handle arguments.
 #
@@ -290,6 +294,9 @@ do
     --boundscheck*)
       KOKKOS_BOUNDS_CHECK="--boundscheck"
       ;;
+    --deprecated-code)
+      KOKKOS_DEPRECATED_CODE="--deprecated-code"
+      ;;
     --build-only*)
       BUILD_ONLY=True
       ;;
@@ -526,7 +533,7 @@ elif [ "$MACHINE" = "kokkos-dev" ]; then
     COMPILERS=("gcc/5.3.0 $BASE_MODULE_LIST "OpenMP" g++ $GCC_WARNING_FLAGS"
                "gcc/7.3.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS"
                "intel/17.0.1 $OLDINTEL_BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS"
-               "clang/5.0.1 $CLANG_BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS"
+               "clang/5.0.1 $CLANG_BASE_MODULE_LIST "Threads_Serial" clang++ $CLANG_WARNING_FLAGS"
                "clang/7.0.1 $CLANG7_MODULE_LIST "Cuda_OpenMP" clang++ $CLANG_WARNING_FLAGS"
                "cuda/9.2 $CUDA9_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
     )
@@ -535,7 +542,7 @@ elif [ "$MACHINE" = "kokkos-dev" ]; then
     COMPILERS=("gcc/5.3.0 $BASE_MODULE_LIST "OpenMP" g++ $GCC_WARNING_FLAGS"
                "gcc/7.3.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS"
                "intel/17.0.1 $OLDINTEL_BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS"
-               "clang/5.0.1 $CLANG_BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS"
+               "clang/5.0.1 $CLANG_BASE_MODULE_LIST "Threads_Serial" clang++ $CLANG_WARNING_FLAGS"
                "clang/7.0.1 $CLANG7_MODULE_LIST "Cuda_OpenMP" clang++ $CLANG_WARNING_FLAGS"
                "cuda/9.2 $CUDA9_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
     )
@@ -564,9 +571,9 @@ elif [ "$MACHINE" = "inouye" ]; then
   MODULE_ENVIRONMENT="module purge"
   eval "$MODULE_ENVIRONMENT"
   SKIP_HWLOC=True
-  export OMP_PROC_BIND=close
-  export OMP_PLACES=cores
-  export OMP_NUM_THREADS=48
+  export omp_proc_bind=close
+  export omp_places=cores
+  export omp_num_threads=47
 
   BASE_MODULE_LIST="cmake/3.17.0,<COMPILER_NAME>/<COMPILER_VERSION>"
 
@@ -620,7 +627,7 @@ elif [ "$MACHINE" = "white" ]; then
   CUDA10_MODULE_TPL_LIST="cmake/3.19.3,<COMPILER_NAME>/<COMPILER_VERSION>,gcc/7.4.0,openblas/0.3.4/gcc/7.4.0"
   IBM_MODULE_TPL_LIST="cmake/3.19.3,<COMPILER_NAME>/xl/<COMPILER_VERSION>,gcc/7.2.0,netlib/3.8.0/ibm/xl/16.1.1"
 
-  # Don't do pthread on white.
+  # Don't do Threads on white.
   GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial"
 
   # Don't run the IBM toolchain with CXX14 on white
@@ -672,13 +679,15 @@ elif [ "$MACHINE" = "weaver" ]; then
   GCC74_MODULE_TPL_LIST="cmake/3.19.3,<COMPILER_NAME>/<COMPILER_VERSION>,openblas/0.2.20/gcc/7.2.0,gcc/7.4.0"
   CUDA_MODULE_TPL_LIST="cmake/3.19.3,<COMPILER_NAME>/<COMPILER_VERSION>,gcc/7.2.0,netlib/3.8.0/gcc/7.2.0"
   CUDA10_MODULE_TPL_LIST="cmake/3.19.3,<COMPILER_NAME>/<COMPILER_VERSION>,gcc/7.2.0,openblas/0.2.20/gcc/7.2.0"
+  # Cuda/11 modules available only on the dev queue (rhel8 OS); gcc/8.3.1 load by default
+  CUDA11_MODULE_LIST="cmake/3.21.2,<COMPILER_NAME>/<COMPILER_VERSION>,openblas/0.3.18/gcc/8.3.1"
     # Issues finding CUBLAS with cuda/10.1.243 module at configure
     # "Could NOT find TPLCUBLAS (missing: CUDA_CUBLAS_LIBRARIES)"
     # Once resolved add the compiler + modules below to the SPOT_CHECK_TPLS
 #               "cuda/10.1.243 $CUDA10_MODULE_TPL_LIST "Cuda_OpenMP" ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
 
 
-  # Don't do pthread on weaver
+  # Don't do Threads on weaver
   GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial"
 
   if [ "$SPOT_CHECK" = "True" ]; then
@@ -707,6 +716,8 @@ elif [ "$MACHINE" = "weaver" ]; then
                "cuda/10.1.105 $CUDA10_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
                "cuda/10.1.243 $CUDA10_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
                "cuda/10.2.089 $CUDA10_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
+               "cuda/10.2.2 $CUDA11_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
+               "cuda/11.2.2 $CUDA11_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
     )
   fi
 
@@ -756,6 +767,8 @@ elif [ "$MACHINE" = "caraway" ]; then
   #   output description and success based only on build succes; build time output (no run-time)
 
   BASE_MODULE_LIST="cmake/3.19.3,<COMPILER_NAME>/<COMPILER_VERSION>"
+  # Cuda11 usage available on the V100 queue
+  CUDA11_MODULE_LIST="cmake/3.22.2,<COMPILER_NAME>/<COMPILER_VERSION>,gcc/8.2.0"
 
   HIPCLANG_BUILD_LIST="Hip_Serial"
   HIPCLANG_WARNING_FLAGS=""
@@ -763,6 +776,12 @@ elif [ "$MACHINE" = "caraway" ]; then
   # Format: (compiler module-list build-list exe-name warning-flag)
   COMPILERS=("rocm/4.3.0 $BASE_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS"
              "rocm/4.5.0 $BASE_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS"
+             "cuda/11.4 $CUDA11_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
+             "gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+             "gcc/8.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+             "gcc/9.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+             "gcc/10.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+             "gcc/11.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
   )
 
   if [ -z "$ARCH_FLAG" ]; then
@@ -789,14 +808,14 @@ elif [ "$MACHINE" = "blake" ]; then
       #"intel/18.1.163 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
       #"pgi/18.7.0 $BASE_MODULE_LIST $GCC_BUILD_LIST pgc++ $PGI_WARNING_FLAGS"
     COMPILERS=("intel/19.1.144 $BASE_MODULE_LIST_INTEL "OpenMP_Serial" icpc $INTEL_WARNING_FLAGS"
-               "gcc/7.2.0 $BASE_MODULE_LIST "Pthread_Serial,OpenMP" g++ $GCC_WARNING_FLAGS"
-               "clang/10.0.1 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS"
+               "gcc/7.2.0 $BASE_MODULE_LIST "Threads_Serial,OpenMP" g++ $GCC_WARNING_FLAGS"
+               "clang/10.0.1 $BASE_MODULE_LIST "Threads_Serial" clang++ $CLANG_WARNING_FLAGS"
     )
   elif [ "$SPOT_CHECK_TPLS" = "True" ]; then
       # Format: (compiler module-list build-list exe-name warning-flag)
       # TODO: Failing toolchains:
       #"pgi/18.7.0 $BASE_MODULE_LIST $GCC_BUILD_LIST pgc++ $PGI_WARNING_FLAGS"
-    COMPILERS=("intel/18.1.163 $BASE_MODULE_LIST_INTEL "OpenMP,Pthread" icpc $INTEL_WARNING_FLAGS"
+    COMPILERS=("intel/18.1.163 $BASE_MODULE_LIST_INTEL "OpenMP,Threads" icpc $INTEL_WARNING_FLAGS"
                "gcc/7.2.0 $GCC72_MODULE_TPL_LIST "OpenMP_Serial" g++ $GCC_WARNING_FLAGS"
     )
   else
@@ -845,36 +864,36 @@ elif [ "$MACHINE" = "kokkos-dev-2" ]; then
 
   CLANG8_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/7.2.0,<COMPILER_NAME>/<COMPILER_VERSION>,cuda/10.0"
 
-  BUILD_LIST_CUDA_NVCC="Cuda_Serial,Cuda_Pthread"
+  BUILD_LIST_CUDA_NVCC="Cuda_Serial,Cuda_Threads"
   BUILD_LIST_CUDA_CLANG="Cuda_Serial,Cuda_OpenMP"
-  BUILD_LIST_CLANG="Serial,Pthread,OpenMP"
+  BUILD_LIST_CLANG="Serial,Threads,OpenMP"
 
   CLANG8_CUDA_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized,-Wno-pass-failed"
 
   if [ "$SPOT_CHECK" = "True" ]; then
     # Format: (compiler module-list build-list exe-name warning-flag)
-    COMPILERS=("gcc/7.3.0 $BASE_MODULE_LIST "OpenMP,Pthread" g++ $GCC_WARNING_FLAGS"
+    COMPILERS=("gcc/7.3.0 $BASE_MODULE_LIST "OpenMP,Threads" g++ $GCC_WARNING_FLAGS"
                "gcc/8.3.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS"
                "gcc/9.1 $GCC91_MODULE_LIST "OpenMP,Serial" g++ $GCC_WARNING_FLAGS"
                "intel/17.0.1 $OLDINTEL_BASE_MODULE_LIST "Serial" icpc $INTEL_WARNING_FLAGS"
                "intel/18.0.5 $OLDINTEL_BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS"
-               "intel/19.0.5 $INTEL_BASE_MODULE_LIST "Pthread" icpc $INTEL_WARNING_FLAGS"
-               "clang/8.0 $CLANG8_MODULE_LIST "Cuda_OpenMP,Pthread_Serial" clang++ $CLANG8_CUDA_WARNING_FLAGS"
-               "clang/9.0.0 $CLANG_BASE_MODULE_LIST "Serial,Pthread" clang++ $CLANG_WARNING_FLAGS"
+               "intel/19.0.5 $INTEL_BASE_MODULE_LIST "Threads" icpc $INTEL_WARNING_FLAGS"
+               "clang/8.0 $CLANG8_MODULE_LIST "Cuda_OpenMP,Threads_Serial" clang++ $CLANG8_CUDA_WARNING_FLAGS"
+               "clang/9.0.0 $CLANG_BASE_MODULE_LIST "Serial,Threads" clang++ $CLANG_WARNING_FLAGS"
                "cuda/10.1 $NVCC_SEMSMODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
                "cuda/9.2 $NVCC_SEMSMODULE_LIST "Cuda_Serial" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
                "cuda/11.0 $NVCC11_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
     )
   elif [ "$SPOT_CHECK_TPLS" = "True" ]; then
     # Format: (compiler module-list build-list exe-name warning-flag)
-    COMPILERS=("gcc/7.3.0 $BASE_MODULE_LIST "OpenMP,Pthread" g++ $GCC_WARNING_FLAGS"
+    COMPILERS=("gcc/7.3.0 $BASE_MODULE_LIST "OpenMP,Threads" g++ $GCC_WARNING_FLAGS"
                "gcc/8.3.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS"
                "gcc/9.1 $GCC91_MODULE_LIST "OpenMP,Serial" g++ $GCC_WARNING_FLAGS"
                "intel/17.0.1 $OLDINTEL_BASE_MODULE_LIST "Serial" icpc $INTEL_WARNING_FLAGS"
                "intel/18.0.5 $OLDINTEL_BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS"
-               "intel/19.0.5 $INTEL_BASE_MODULE_LIST "Pthread" icpc $INTEL_WARNING_FLAGS"
-               "clang/8.0 $CLANG8_MODULE_LIST "Cuda_OpenMP,Pthread_Serial" clang++ $CLANG8_CUDA_WARNING_FLAGS"
-               "clang/9.0.0 $CLANG_BASE_MODULE_LIST "Serial,Pthread" clang++ $CLANG_WARNING_FLAGS"
+               "intel/19.0.5 $INTEL_BASE_MODULE_LIST "Threads" icpc $INTEL_WARNING_FLAGS"
+               "clang/8.0 $CLANG8_MODULE_LIST "Cuda_OpenMP,Threads_Serial" clang++ $CLANG8_CUDA_WARNING_FLAGS"
+               "clang/9.0.0 $CLANG_BASE_MODULE_LIST "Serial,Threads" clang++ $CLANG_WARNING_FLAGS"
                "cuda/10.1 $NVCC_SEMSMODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
                "cuda/11.0 $NVCC11_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
     )
@@ -916,9 +935,9 @@ else
   exit 1
 fi
 
-export OMP_NUM_THREADS=8
-export OMP_PROC_BIND=spread
-export OMP_PLACES=cores
+export OMP_NUM_THREADS=${omp_num_threads:=8}
+export OMP_PROC_BIND=${omp_proc_bind:=spread}
+export OMP_PLACES=${omp_places:=cores}
 
 declare -i NUM_RESULTS_TO_KEEP=7
 
@@ -1318,13 +1337,13 @@ single_build_and_test() {
 
     # KOKKOS_OPTIONS and KOKKOS_CUDA_OPTIONS are exported and detected by kokkos' generate_makefile.sh during install of kokkos; we pass them to the reproducer script instructions
     echo "  #   Use generate_makefile line below to call cmake which generates makefile for this build:" &> call_generate_makefile.sh
-    echo "        ${KOKKOSKERNELS_PATH}/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD $HIP_ENABLE_CMD --kokkos-path=${KOKKOS_PATH} --kokkoskernels-path=${KOKKOSKERNELS_PATH} --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} --with-options=${KOKKOS_OPTIONS} --with-cuda-options=${KOKKOS_CUDA_OPTIONS} ${KOKKOS_BOUNDS_CHECK} ${KOKKOSKERNELS_SPACES} --no-examples $extra_args" &>> call_generate_makefile.sh
+    echo "        ${KOKKOSKERNELS_PATH}/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD $HIP_ENABLE_CMD --kokkos-path=${KOKKOS_PATH} --kokkoskernels-path=${KOKKOSKERNELS_PATH} --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} --with-options=${KOKKOS_OPTIONS} --with-cuda-options=${KOKKOS_CUDA_OPTIONS} ${KOKKOS_BOUNDS_CHECK} ${KOKKOSKERNELS_SPACES} --no-examples ${KOKKOS_DEPRECATED_CODE} $extra_args" &>> call_generate_makefile.sh
     chmod +x call_generate_makefile.sh
 
     # script command with generic path for faster copy/paste of reproducer into issues
-    echo "  #     \$KOKKOSKERNELS_PATH/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD $HIP_ENABLE_CMD --kokkos-path=\$KOKKOS_PATH --kokkoskernels-path=\$KOKKOSKERNELS_PATH --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} --with-options=${KOKKOS_OPTIONS} --with-cuda-options=${KOKKOS_CUDA_OPTIONS} ${KOKKOS_BOUNDS_CHECK} ${KOKKOSKERNELS_SPACES} --no-examples $extra_args" &> call_generate_makefile_genericpath.sh
+    echo "  #     \$KOKKOSKERNELS_PATH/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD $HIP_ENABLE_CMD --kokkos-path=\$KOKKOS_PATH --kokkoskernels-path=\$KOKKOSKERNELS_PATH --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} --with-options=${KOKKOS_OPTIONS} --with-cuda-options=${KOKKOS_CUDA_OPTIONS} ${KOKKOS_BOUNDS_CHECK} ${KOKKOSKERNELS_SPACES} --no-examples ${KOKKOS_DEPRECATED_CODE} $extra_args" &> call_generate_makefile_genericpath.sh
 
-    run_cmd ${KOKKOSKERNELS_PATH}/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD $HIP_ENABLE_CMD --kokkos-path=${KOKKOS_PATH} --kokkoskernels-path=${KOKKOSKERNELS_PATH} --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} ${KOKKOS_BOUNDS_CHECK} ${KOKKOSKERNELS_SPACES} --no-examples $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; }
+    run_cmd ${KOKKOSKERNELS_PATH}/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD $HIP_ENABLE_CMD --kokkos-path=${KOKKOS_PATH} --kokkoskernels-path=${KOKKOSKERNELS_PATH} --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} ${KOKKOS_BOUNDS_CHECK} ${KOKKOSKERNELS_SPACES} --no-examples ${KOKKOS_DEPRECATED_CODE} $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; }
 
     local make_par_lvl=12
     if [[ "$MACHINE" = white* ]]; then
diff --git a/scripts/docker/Dockerfile.sycl b/scripts/docker/Dockerfile.sycl
index 888a36d510..3d94a1a45e 100644
--- a/scripts/docker/Dockerfile.sycl
+++ b/scripts/docker/Dockerfile.sycl
@@ -1,6 +1,8 @@
 ARG BASE=nvidia/cuda:10.2-devel
 FROM $BASE
 
+RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub
+
 RUN apt-get update && apt-get install -y \
         bc \
         wget \
@@ -36,8 +38,8 @@ RUN CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSIO
 ENV PATH=${CMAKE_DIR}/bin:$PATH
 
 ENV SYCL_DIR=/opt/sycl
-RUN SYCL_VERSION=2021-09 && \
-    SYCL_URL=https://github.com/intel/llvm/archive && \
+RUN SYCL_VERSION=20220112 && \
+    SYCL_URL=https://github.com/intel/llvm/archive/sycl-nightly && \
     SYCL_ARCHIVE=${SYCL_VERSION}.tar.gz && \
     SCRATCH_DIR=/scratch && mkdir -p ${SCRATCH_DIR} && cd ${SCRATCH_DIR} && \
     wget --quiet ${SYCL_URL}/${SYCL_ARCHIVE} && \
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index a3460d1413..a1c938aed5 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -52,7 +52,7 @@ IF (KOKKOSKERNELS_ENABLE_TPL_BLAS OR KOKKOSKERNELS_ENABLE_TPL_MKL)
   APPEND_GLOB(SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/impl/tpls/KokkosBlas_Host_tpl.cpp)
 ENDIF()
 
-include(kokkoskernels_eti.cmake)
+include(cmake/kokkoskernels_eti.cmake)
 SET(ETI_HEADERS)
 
 #Build up a list of DECL, AVAIL, and INST macros
@@ -360,6 +360,13 @@ KOKKOSKERNELS_GENERATE_ETI(Sparse_spgemm_numeric spgemm_numeric
   TYPE_LISTS  FLOATS ORDINALS OFFSETS LAYOUTS DEVICES_W_SLOW_SPACE
 )
 
+KOKKOSKERNELS_GENERATE_ETI(Sparse_bspgemm_numeric bspgemm_numeric
+  COMPONENTS  sparse
+  HEADER_LIST ETI_HEADERS
+  SOURCE_LIST SOURCES
+  TYPE_LISTS  FLOATS ORDINALS OFFSETS LAYOUTS DEVICES_W_SLOW_SPACE
+)
+
 KOKKOSKERNELS_GENERATE_ETI(Sparse_spgemm_jacobi spgemm_jacobi
   COMPONENTS  sparse
   HEADER_LIST ETI_HEADERS
@@ -367,6 +374,22 @@ KOKKOSKERNELS_GENERATE_ETI(Sparse_spgemm_jacobi spgemm_jacobi
   TYPE_LISTS  FLOATS ORDINALS OFFSETS LAYOUTS DEVICES_W_SLOW_SPACE
 )
 
+# NOTE: SpAdd symbolic doesn't use scalars directly,
+# but it needs the type to use handles.
+KOKKOSKERNELS_GENERATE_ETI(Sparse_spadd_symbolic spadd_symbolic
+  COMPONENTS  sparse
+  HEADER_LIST ETI_HEADERS
+  SOURCE_LIST SOURCES
+  TYPE_LISTS  FLOATS ORDINALS OFFSETS LAYOUTS DEVICES
+)
+
+KOKKOSKERNELS_GENERATE_ETI(Sparse_spadd_numeric spadd_numeric
+  COMPONENTS  sparse
+  HEADER_LIST ETI_HEADERS
+  SOURCE_LIST SOURCES
+  TYPE_LISTS  FLOATS ORDINALS OFFSETS LAYOUTS DEVICES
+)
+
 KOKKOSKERNELS_GENERATE_ETI(Sparse_spiluk_symbolic spiluk_symbolic
   COMPONENTS  sparse
   HEADER_LIST ETI_HEADERS
@@ -416,6 +439,13 @@ KOKKOSKERNELS_GENERATE_ETI(Sparse_gauss_seidel_apply gauss_seidel_apply
   TYPE_LISTS  FLOATS ORDINALS OFFSETS LAYOUTS DEVICES_W_SLOW_SPACE
 )
 
+KOKKOSKERNELS_GENERATE_ETI(Graph_color_d1 color_d1
+  COMPONENTS  graph
+  HEADER_LIST ETI_HEADERS
+  SOURCE_LIST SOURCES
+  TYPE_LISTS  FLOATS ORDINALS OFFSETS LAYOUTS DEVICES_W_SLOW_SPACE
+)
+
 LIST(APPEND HEADERS ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME}_config.h)
 
 #Add a few other utility files
diff --git a/src/KokkosKernels_Macros.hpp b/src/KokkosKernels_Macros.hpp
index 1630028c54..67d86b6e0e 100644
--- a/src/KokkosKernels_Macros.hpp
+++ b/src/KokkosKernels_Macros.hpp
@@ -66,9 +66,10 @@
 // https://clang.llvm.org/docs/OpenMPSupport.html#id1
 #if defined(KOKKOS_COMPILER_GNU) || defined(KOKKOS_COMPILER_CLANG)
 // GCC 4.8.5 and older do not support #pragma omp simd
-// Do not enable when using GCC 7.2.0 + C++17 due to a bug in gcc
-#if (KOKKOS_COMPILER_GNU > 485) && \
-    !(KOKKOS_COMPILER_GNU == 720 && defined(KOKKOS_ENABLE_CXX17))
+// Do not enable when using GCC 7.2.0 or 7.3.0 + C++17 due to a bug in gcc
+#if (KOKKOS_COMPILER_GNU > 485) &&                                   \
+    !(KOKKOS_COMPILER_GNU == 720 && defined(KOKKOS_ENABLE_CXX17)) && \
+    !(KOKKOS_COMPILER_GNU == 730 && defined(KOKKOS_ENABLE_CXX17))
 #define KOKKOSKERNELS_ENABLE_OMP_SIMD
 #endif
 // TODO: Check for a clang version that supports #pragma omp simd
diff --git a/src/Kokkos_ArithTraits.hpp b/src/Kokkos_ArithTraits.hpp
deleted file mode 100644
index 68bcdf79ea..0000000000
--- a/src/Kokkos_ArithTraits.hpp
+++ /dev/null
@@ -1,3979 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_ARITHTRAITS_HPP
-#define KOKKOS_ARITHTRAITS_HPP
-
-/// \file Kokkos_ArithTraits.hpp
-/// \brief Declaration and definition of Kokkos::Details::ArithTraits
-
-#include <KokkosKernels_config.h>
-#include <Kokkos_Complex.hpp>
-#include <KokkosKernels_Half.hpp>
-#include <Kokkos_Macros.hpp>
-
-#ifdef HAVE_KOKKOSKERNELS_QUADMATH
-#include <quadmath.h>
-#endif  // HAVE_KOKKOSKERNELS_QUADMATH
-
-#include <cfloat>
-#include <climits>
-#include <cmath>
-#include <complex>  // std::complex
-#include <limits>   // std::numeric_limits
-#ifdef __CUDACC__
-#include <math_constants.h>
-#endif
-
-namespace {  // anonymous
-
-/// \fn intPowImpl
-/// \tparam IntType A built-in integer type.
-/// \brief Implementation of intPowSigned and intPowUnsigned.
-///
-/// \pre x != 0
-/// \pre y > 0
-///
-/// Use intPowSigned or intPowUnsigned for general y.
-template <class IntType>
-KOKKOS_FORCEINLINE_FUNCTION IntType intPowImpl(const IntType x,
-                                               const IntType y) {
-  // Recursion (unrolled into while loop): pow(x, 2y) = (x^y)^2
-  IntType prod  = x;
-  IntType y_cur = 1;
-  // If y == 1, then prod stays x.
-  while (y_cur < y) {
-    prod  = prod * prod;
-    y_cur = y_cur << 1;
-  }
-  // abs(y - y_cur) < floor(log2(y)), so it won't hurt asymptotic run
-  // time to finish the remainder in a linear iteration.
-  if (y > y_cur) {
-    const IntType left = y - y_cur;
-    for (IntType k = 0; k < left; ++k) {
-      prod = prod * x;
-    }
-  } else if (y < y_cur) {
-    // There's probably a better way to do this in order to avoid the
-    // (expensive) integer division, but I'm not motivated to think of
-    // it at the moment.
-    const IntType left = y_cur - y;
-    for (IntType k = 0; k < left; ++k) {
-      prod = prod / x;
-    }
-  }
-  return prod;
-
-  // y = 8:
-  //
-  // x,1   -> x^2,2
-  // x^2,2 -> x^4,4
-  // x^4,4 -> x^8,8
-  //
-  // y = 9:
-  //
-  // x,1   -> x^2,2
-  // x^2,2 -> x^4,4
-  // x^4,4 -> x^8,8
-  //
-  // y - y_cur is what's left over.  Just do it one at a time.
-  //
-  // y = 3:
-  // x,1   -> x^2,2
-  // x^2,2 -> x^4,4
-}
-
-// Warning free abs function for types where we don't know whether they are
-// signed (like char)
-template <class T, bool is_signed = std::numeric_limits<T>::is_signed>
-struct integer_abs {
-  static KOKKOS_INLINE_FUNCTION T abs(const T& val);
-};
-
-template <class T>
-struct integer_abs<T, true> {
-  static KOKKOS_INLINE_FUNCTION T abs(const T& x) { return x < 0 ? -x : x; }
-};
-
-template <class T>
-struct integer_abs<T, false> {
-  static KOKKOS_INLINE_FUNCTION T abs(const T& x) { return x; }
-};
-
-/// \fn intPowSigned
-/// \tparam IntType A built-in signed integer type.
-/// \brief Compute x raised to the power y.
-///
-/// If the arguments are invalid (e.g., if x and y are both zero), the
-/// result of this function is undefined.  However, this function will
-/// not throw an exception in that case.
-template <class IntType>
-KOKKOS_FORCEINLINE_FUNCTION
-    typename std::enable_if<std::numeric_limits<IntType>::is_signed,
-                            IntType>::type
-    intPowSigned(const IntType x, const IntType y) {
-  // It's not entirely clear what to return if x and y are both zero.
-  // In the case of floating-point numbers, 0^0 is NaN.  Here, though,
-  // I think it's safe to return 0.
-  if (x == 0) {
-    return 0;
-  } else if (y == 0) {
-    return 1;
-  } else if (y < 0) {
-    if (x == 1) {
-      return 1;
-    } else if (x == -1) {
-      return (y % 2 == 0) ? 1 : -1;
-    } else {
-      return 0;  // round the fraction to zero
-    }
-  }
-  return intPowImpl<IntType>(x, y);
-}
-template <class IntType>
-KOKKOS_FORCEINLINE_FUNCTION
-    typename std::enable_if<!std::numeric_limits<IntType>::is_signed,
-                            IntType>::type
-    intPowSigned(const IntType x, const IntType y) {
-  // It's not entirely clear what to return if x and y are both zero.
-  // In the case of floating-point numbers, 0^0 is NaN.  Here, though,
-  // I think it's safe to return 0.
-  if (x == 0) {
-    return 0;
-  } else if (y == 0) {
-    return 1;
-  }
-  return intPowImpl<IntType>(x, y);
-}
-
-/// \fn intPowUnsigned
-/// \tparam IntType A built-in unsigned integer type.
-/// \brief Compute x raised to the power y.
-///
-/// If the arguments are invalid (e.g., if x and y are both zero), the
-/// result of this function is undefined.  However, this function will
-/// not throw an exception in that case.
-template <class IntType>
-KOKKOS_FORCEINLINE_FUNCTION IntType intPowUnsigned(const IntType x,
-                                                   const IntType y) {
-  // It's not entirely clear what to return if x and y are both zero.
-  // In the case of floating-point numbers, 0^0 is NaN.  Here, though,
-  // I think it's safe to return 0.
-  if (x == 0) {
-    return 0;
-  } else if (y == 0) {
-    return 1;
-  } else {
-    return intPowImpl<IntType>(x, y);
-  }
-}
-
-// It might make sense to use special sqrt() approximations for
-// integer arguments, like those presented on the following web site:
-//
-// http://www.azillionmonkeys.com/qed/sqroot.html#implementations
-//
-// Note that some of the implementations on the above page break ANSI
-// C(++) aliasing rules (by assigning to the results of
-// reinterpret_cast-ing between int and float).  It's also just a
-// performance optimization and not required for a reasonable
-// implementation.
-
-}  // namespace
-
-namespace Kokkos {
-namespace Details {
-
-/// \class ArithTraits
-/// \brief Traits class for arithmetic on type T.
-/// \tparam T "Scalar" type of interest
-///
-/// This is a traits class for the "arithmetic" type T.  "Arithmetic
-/// types" include built-in signed and unsigned integer types,
-/// floating-point types, complex-valued types, and anything else that
-/// looks like these.  This class is useful for implementing numerical
-/// algorithms that are generic on the data type.  You may also use
-/// this class to query attributes of T, like whether it is signed or
-/// complex, or its precision.
-///
-/// We really did not want to implement this class or expose it to
-/// users.  It would be much better to use existing traits classes
-/// like std::numeric_limits.  We decided to implement and expose this
-/// class for the following reasons:
-/// <ol>
-/// <li> std::numeric_limits class methods cannot be used in CUDA
-///      device functions, since they themselves are not device
-///      functions </li>
-/// <li> Existing traits classes like std::numeric_limits do not
-///      provide enough information to implement algorithms that are
-///      agnostic of whether T is real-valued or complex-valued. </li>
-/// </ol>
-///
-/// All class methods must be suitable for parallel kernels, if the
-/// type T itself is suitable for parallel kernels.  In particular,
-/// specializations for types T that make sense to use on a CUDA
-/// device must mark all class methods as device (and host) functions,
-/// using the KOKKOS_FORCEINLINE_FUNCTION macro.  All class methods must be
-/// callable both inside and outside a parallel kernel (for CUDA, this
-/// means they must be marked as both device and host functions).
-///
-/// \section Kokkos_ArithTraits_compat Compatibility
-///
-/// Whenever possible, class methods in ArithTraits use the same names
-/// as their equivalents in the C++ Standard Library.  If this was not
-/// possible, for example with isInf and isNan, we explain why in
-/// their documentation.
-///
-/// This class has redundant typedefs and methods in order to maintain
-/// backwards compatibility with Teuchos::ScalarTraits, while
-/// preferring forwards (partial) compatibility with
-/// std::numeric_limits.  Users should prefer typedefs, \c bool
-/// constants, and class methods compatible with std::numeric_limits,
-/// to those from Teuchos::ScalarTraits.  The latter may go away at
-/// any time.  Furthermore, Teuchos::ScalarTraits contains methods
-/// that do not make sense for use as parallel device functions, in
-/// particular those relating to pseudorandom number generation that
-/// refer to hidden state, so we will never include all class methods
-/// from Teuchos::ScalarTraits in ArithTraits.
-///
-/// \section Kokkos_ArithTraits_unsupp Unsupported types on CUDA devices
-///
-/// CUDA does not support long double or std::complex<T> in device
-/// functions.  ArithTraits does have specializations for these types,
-/// but the class methods therein are not marked as device functions.
-///
-/// \section Kokkos_ArithTraits_whyNotC99 What about C99 integer types?
-///
-/// C99 and C++11 include typedefs int${N}_t and uint${N}_t, where N
-/// is the number of bits in the integer.  These typedefs are useful
-/// because they make the length of the type explicit.  Users are
-/// welcome to use these types as the template parameter of
-/// ArithTraits.
-///
-/// We chose not to use these types when <i>defining</i> full
-/// specializations of ArithTraits.  This is because the C99 integer
-/// types are typedefs, not types in themselves.  This makes it
-/// impossible to avoid duplicate or missing full specializations of
-/// ArithTraits.  For example, on my Mac, for CUDA 5.5, gcc 4.2.1, and
-/// Clang 3.2, <tt>int64_t</tt> is a typedef of <tt>long long</tt>,
-/// but <tt>long long</tt> and <tt>long</tt> are separate types, even
-/// though they have the same length (64 bits).  In contrast, on
-/// Windows (even Win64), <tt>long</tt> is a 32-bit type (but a
-/// distinct type from <tt>int</tt>), and <tt>long long</tt> is a
-/// 64-bit type.  Thus, if we define full specializations of
-/// ArithTraits using <i>only</i> the C99 integer types, we will be
-/// missing a specialization for <tt>long</tt> on at least one
-/// platform.
-///
-/// Rather than trouble ourselves with trying to figure this out for
-/// each platform, we decided to provide specializations only for the
-/// integer types in the C89 and C++03 language standards.  This
-/// includes signed and unsigned versions of <tt>char</tt>,
-/// <tt>short</tt>, <tt>int</tt>, and <tt>long</tt>.  We also include
-/// <tt>long long</tt> if your platform supports it.  We may thus have
-/// left out some C99 integer type, but this is only possible if the
-/// C89 / C++03 integer types do not have complete coverage of all
-/// powers of two bits from 8 up to the longest provided length (e.g.,
-/// 64 on a 64-bit system).  On all platforms I have encountered,
-/// <tt>char</tt> has 8 bits and <tt>short</tt> has 16 bits, so I am
-/// not worried about missing specializations for <tt>int16_t</tt> or
-/// <tt>uint16_t</tt>.  If you should find that either of these
-/// specializations are missing, though, please let us know.
-///
-/// Note that <tt>char</tt>, <tt>signed char</tt>, and <tt>unsigned
-/// char</tt> are distinct types, whether <tt>char</tt> is signed or
-/// unsigned.  (The language standards do not specify whether
-/// <tt>char</tt> is signed or unsigned.)  That is, <tt>char</tt> is
-/// <i>not</i> a typedef of <tt>signed char</tt> or <tt>unsigned
-/// char</tt>.  This is why we provide full specializations of
-/// ArithTraits for each of these types.  Interestingly enough, on my
-/// system, <tt>char</tt> and <tt>int8_t</tt> are different types, but
-/// <tt>signed char</tt> and <tt>int8_t</tt> are the same.
-///
-/// \section Kokkos_ArithTraits_impl Implementation notes
-///
-/// This section contains notes to developers who which to add a
-/// partial specialization of this class for a new type T.  If you
-/// decide to write a default templated implementation, it must not
-/// declare any methods as device functions.  This ensures correct
-/// behavior for arbitrary T, but does require specializations for
-/// common types like T = float and double, as well as for other types
-/// T that make sense to use on a CUDA device.
-template <class T>
-class ArithTraits {
- public:
-  /// \brief A type that acts like T and works with Kokkos.
-  ///
-  /// This is usually just an alias for T.  However, some types T do
-  /// not work well with Kokkos.  In that case, we use a mostly
-  /// equivalent type here.  For example, ArithTraits<std::complex<R>
-  /// >::val_type is Kokkos::complex<R>.
-  typedef T val_type;
-  /// \brief The type of the magnitude (absolute value) of T.
-  ///
-  /// We define this as the type returned by abs() in this class.  If
-  /// T is real (not complex), then \c val_type and \c mag_type are
-  /// usually the same.  If T is <tt>std::complex<R></tt> for some R,
-  /// then R and \c mag_type are usually the same.
-  typedef T mag_type;
-
-  //! Whether ArithTraits has a specialization for T.
-  static const bool is_specialized = false;
-  //! Whether T is a signed type (has negative values).
-  static const bool is_signed = false;
-  //! Whether T is an integer type.
-  static const bool is_integer = false;
-  /// \brief Whether T "uses exact representations."
-  ///
-  /// The opposite of is_exact is "is approximate," that is, "may
-  /// commit rounding error."
-  static const bool is_exact = false;
-  //! Whether T is a complex-valued type.
-  static const bool is_complex = false;
-
-  /// \brief Whether x is Inf.
-  ///
-  /// This can only be true for floating-point types T that support
-  /// Inf.  If T is a complex type, we say that a T instance x is Inf
-  /// if and only if <tt>isinf(real(x)) || isinf(imag(x))</tt>.
-  ///
-  /// Unfortunately we can't call this "isinf" (the equivalent C99
-  /// function), because CUDA appears to implement that function using
-  /// a macro, rather than using a function (as C++11 requires).
-  static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const T& x);
-
-  /// \brief Whether x is NaN (not a number).
-  ///
-  /// This can only be true for floating-point types T that support
-  /// NaN.  If T is a complex type, we say that a T instance x is NaN
-  /// if and only if <tt>isNan(real(x)) || isNan(imag(x))</tt>.
-  ///
-  /// Unfortunately we can't call this "isnan" (the equivalent C99
-  /// function), because CUDA appears to implement that function using
-  /// a macro, rather than using a function (as C++11 requires).
-  static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const T& x);
-
-  //! The absolute value (magnitude) of x.
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const T& x);
-
-  //! The zero value of T; the arithmetic identity.
-  static KOKKOS_FORCEINLINE_FUNCTION T zero();
-
-  //! The one value of T; the multiplicative identity.
-  static KOKKOS_FORCEINLINE_FUNCTION T one();
-
-  /// \brief True if this type T is capable of representing the
-  /// positive infinity as a distinct special value, as with
-  /// std::numeric_limits<T>::has_infinity.
-  static constexpr bool has_infinity = false;
-
-  /// \brief Returns the special value "positive infinity", as
-  /// represented by the floating-point type T. Only meaningful if
-  /// KokkosArithTraits<T>::has_infinity == true. Provides same
-  /// functionality as std::numeric_limits<T>::infinity().
-  ///
-  /// \note Would have liked to mark it as constexpr but then would
-  /// not be able to provide the specialization for std::complex<T>
-  /// since its constructor only becomes constexpr with C++14.
-  static KOKKOS_FORCEINLINE_FUNCTION T infinity();
-
-  /// \brief The minimum possible value of T.
-  ///
-  /// If T is a real floating-point type, then this is the minimum
-  /// <i>positive</i> value, as with std::numeric_limits<T>::min().
-  static KOKKOS_FORCEINLINE_FUNCTION T min();
-
-  //! The maximum possible value of T.
-  static KOKKOS_FORCEINLINE_FUNCTION T max();
-
-  /// \brief The real part of x.
-  ///
-  /// If \c is_complex is false, then this just returns x.
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const T& x);
-
-  /// \brief The imaginary part of x.
-  ///
-  /// If \c is_complex is false, then this just returns zero().
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const T&);
-
-  /// \brief The complex conjugate of x.
-  ///
-  /// If \c is_complex is false, then this just returns x.
-  static KOKKOS_FORCEINLINE_FUNCTION T conj(const T&);
-
-  //! x raised to the power y.
-  static KOKKOS_FORCEINLINE_FUNCTION T pow(const T& x, const T& y);
-
-  /// \brief The square root of x.
-  ///
-  /// If T is an integer type, this is the floor of the square root.
-  /// If T is a complex-valued type, then this method returns the
-  /// principal branch of the square root.
-  ///
-  /// If T is real-valued and x is negative, the result of the square
-  /// root is undefined in general.  (CUDA does not allow throwing
-  /// exceptions in device functions.)  Implementations should return
-  /// NaN if the type T supports this.  Of course, in that case, the
-  /// square of the result will not equal x.
-  static KOKKOS_FORCEINLINE_FUNCTION T sqrt(const T& x);
-
-  /// \brief The cubic root of x.
-  ///
-  /// If T is an integer type, this is the floor of the cubic root.
-  /// If T is a complex-valued type, then this method returns the
-  /// principal branch of the cubic root.
-  ///
-  /// If T is real-valued and x is negative, the result of the cubic
-  /// root is undefined in general.  (CUDA does not allow throwing
-  /// exceptions in device functions.)  Implementations should return
-  /// NaN if the type T supports this.  Of course, in that case, the
-  /// cubic of the result will not equal x.
-  static KOKKOS_FORCEINLINE_FUNCTION T cbrt(const T& x);
-
-  /// \brief The natural (base e) exponential function of x.
-  ///
-  /// If T is an integer type, this is the floor of the exponential
-  /// function.  If T is a complex-valued type, then this method
-  /// returns \f$e^{x+iy} = e^x ( cos(y) + i sin(y) )\f$.
-  ///
-  static KOKKOS_FORCEINLINE_FUNCTION T exp(const T& x);
-
-  /// \brief The natural (base e) logarithm of x.
-  ///
-  /// If T is an integer type, this is the floor of the logarithm.  If
-  /// T is a complex-valued type, then this method returns the
-  /// principal branch of the logarithm.
-  ///
-  /// If T is real-valued and x is negative, the result of the
-  /// logarithm is undefined in general.  (CUDA does not allow
-  /// throwing exceptions in device functions.)  Implementations
-  /// should return NaN if the type T supports this.  Of course, in
-  /// that case, if y is the result, \f$e^y\f$ will not equal x.
-  static KOKKOS_FORCEINLINE_FUNCTION T log(const T& x);
-
-  /// \brief The base ten logarithm of the input.
-  ///
-  /// If T is an integer type, this is the floor of the logarithm.  If
-  /// T is a complex-valued type, then this method returns the
-  /// principal branch of the logarithm.
-  ///
-  /// If T is real-valued and x is negative, the result of the
-  /// logarithm is undefined in general.  (CUDA does not allow
-  /// throwing exceptions in device functions.)  Implementations
-  /// should return NaN if the type T supports this.  Of course, in
-  /// that case, if y is the result, \f$10^y\f$ will not equal x.
-  static KOKKOS_FORCEINLINE_FUNCTION T log10(const T& x);
-
-  /// Trigonometric and hyperbolic functions are not available
-  /// for integer types. This is because asin(sin(x)) is not x
-  /// when x is integer with a rounding error.
-  ///
-  ///  KJ: log, exp also has this problem. We probably need to
-  ///      disable them for integer types instead of providing
-  ///      functionality with floor.
-
-  /// \brief The sin function of x
-  ///
-  static KOKKOS_FORCEINLINE_FUNCTION T sin(const T& x);
-
-  /// \brief The cos function of x
-  ///
-  static KOKKOS_FORCEINLINE_FUNCTION T cos(const T& x);
-
-  /// \brief The tan function of x
-  ///
-  static KOKKOS_FORCEINLINE_FUNCTION T tan(const T& x);
-
-  /// \brief The sin hyperbolic function of x
-  ///
-  static KOKKOS_FORCEINLINE_FUNCTION T sinh(const T& x);
-
-  /// \brief The cos hyperbolic function of x
-  ///
-  static KOKKOS_FORCEINLINE_FUNCTION T cosh(const T& x);
-
-  /// \brief The tan hyperbolic function of x
-  ///
-  static KOKKOS_FORCEINLINE_FUNCTION T tanh(const T& x);
-
-  /// \brief The asin function of x
-  ///
-  static KOKKOS_FORCEINLINE_FUNCTION T asin(const T& x);
-
-  /// \brief The acos function of x
-  ///
-  static KOKKOS_FORCEINLINE_FUNCTION T acos(const T& x);
-
-  /// \brief The atan function of x
-  ///
-  static KOKKOS_FORCEINLINE_FUNCTION T atan(const T& x);
-
-  /// \brief Return a silent NaN, if appropriate for T.
-  ///
-  /// If T does <i>not</i> implement a silent NaN, the return value is
-  /// undefined, but calling this method is still allowed.
-  static KOKKOS_FORCEINLINE_FUNCTION T nan();
-
-  /// \brief Machine epsilon.
-  ///
-  /// If T is an integer type (std::numeric_traits<T>::is_exact is
-  /// true), then epsilon() returns 0.  Otherwise, if T is a
-  /// floating-point type, it returns machine epsilon that T.
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon();
-
-  //@{
-  /// \name Traits defined for backwards compatibility with
-  /// Teuchos::ScalarTraits
-  ///
-  /// All of the typedefs, \c bool constants, and class methods in
-  /// this section are defined in order that one may replace most uses
-  /// of Teuchos::ScalarTraits with ArithTraits.  Users who do not
-  /// have this backwards compatibility requirement should prefer
-  /// equivalents in other sections.  Those class methods which have
-  /// the same name and meaning in both Teuchos::ScalarTraits and this
-  /// class, such as log() and pow(), are not in this section.
-
-  //! Same as mag_type; the type of the absolute value (magnitude) of T.
-  typedef T magnitudeType;
-
-  /// \brief The type with "half the precision" of T.
-  ///
-  /// This typedef only makes sense if T is a floating-point type.
-  typedef T halfPrecision;
-
-  /// \brief The type with "twice the the precision" of T.
-  ///
-  /// This typedef only makes sense if T is a floating-point type.
-  typedef T doublePrecision;
-
-  static const bool isComplex    = false;
-  static const bool isOrdinal    = false;
-  static const bool isComparable = false;
-
-  /// \brief True if this type T has floating-point parameters.
-  ///
-  /// This is true if and only if this specialization of ArithTraits
-  /// has "machine-specific" parameters eps(), sfmin(), base(),
-  /// prec(), t(), rnd(), emin(), rmin(), emax(), and rmax(), relating
-  /// to floating-point types.
-  static const bool hasMachineParameters = false;
-
-  //! Return relative machine precision.
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type eps();
-
-  //! Return safe minimum (sfmin), such that 1/sfmin does not overflow.
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type sfmin();
-
-  //! Return the base of the scalar type T.
-  static KOKKOS_FORCEINLINE_FUNCTION int base();
-
-  //! Return <tt>eps*base</tt>.
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type prec();
-
-  //! Returns the number of (base) digits in the significand.
-  static KOKKOS_FORCEINLINE_FUNCTION int t();
-
-  //! 1.0 when rounding occurs in addition, else 0.0.
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type rnd();
-
-  //! Returns the minimum exponent before (gradual) underflow.
-  static KOKKOS_FORCEINLINE_FUNCTION int emin();
-
-  //! Returns the underflow threshold: <tt>base^(emin-1)</tt>
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type rmin();
-
-  //! Returns the largest exponent before overflow.
-  static KOKKOS_FORCEINLINE_FUNCTION int emax();
-
-  //! Overflow theshold: <tt>(base^emax)*(1-eps)</tt>
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type rmax();
-
-  //! Same as abs(); return the magnitude of x.
-  static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const T& x);
-
-  //! Same as conj(); return the complex conjugate of x.
-  static KOKKOS_FORCEINLINE_FUNCTION T conjugate(const T& x);
-
-  /// \brief Whether x is (silent) NaN or Inf.
-  ///
-  /// This is the same as <tt>isNan(x) || isInf(x)</tt>.
-  static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const T& x);
-
-  /// \brief The string name of T.
-  ///
-  /// Note that this is not a device function.
-  static std::string name();
-
-  //! Same as sqrt(x); the square root of x.
-  static KOKKOS_FORCEINLINE_FUNCTION T squareroot(const T& x);
-  //@}
-};
-
-// Since Kokkos::Experimental::half_t falls back to float, only define
-// ArithTraits if half_t is a backend specialization
-#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT
-template <>
-class ArithTraits<Kokkos::Experimental::half_t> {
- public:
-  typedef Kokkos::Experimental::half_t val_type;
-  typedef val_type mag_type;
-
-  static const bool is_specialized = true;
-  static const bool is_signed      = true;
-  static const bool is_integer     = false;
-  static const bool is_exact       = false;
-  static const bool is_complex     = false;
-
-  static constexpr bool has_infinity = true;
-  static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() {
-    return Kokkos::Experimental::cast_to_half(HUGE_VALF);
-  }
-
-  static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type x) {
-#ifndef __CUDA_ARCH__
-    using std::isinf;
-#endif
-    return isinf(Kokkos::Experimental::cast_from_half<float>(x));
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type x) {
-#ifndef __CUDA_ARCH__
-    using std::isnan;
-#endif
-    return isnan(Kokkos::Experimental::cast_from_half<float>(x));
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) {
-    return Kokkos::Experimental::cast_to_half(
-        fabs(Kokkos::Experimental::cast_from_half<float>(x)));
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type zero() {
-    return Kokkos::Experimental::cast_to_half(0.0F);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type one() {
-    return Kokkos::Experimental::cast_to_half(1.0F);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type min() {
-    return Kokkos::Experimental::cast_to_half(-KOKKOSKERNELS_IMPL_FP16_MAX);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type max() {
-    return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_MAX);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) {
-    return x;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) {
-    return Kokkos::Experimental::cast_to_half(0.0F);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) {
-    return x;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x,
-                                                  const val_type y) {
-    return Kokkos::Experimental::cast_to_half(
-        ::pow(Kokkos::Experimental::cast_from_half<float>(x),
-              Kokkos::Experimental::cast_from_half<float>(y)));
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) {
-    return Kokkos::Experimental::cast_to_half(
-        ::sqrt(Kokkos::Experimental::cast_from_half<float>(x)));
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) {
-    return Kokkos::Experimental::cast_to_half(
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-        sycl::cbrt(Kokkos::Experimental::cast_from_half<float>(x))
-#else
-        ::cbrt(Kokkos::Experimental::cast_from_half<float>(x))
-#endif
-    );
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) {
-    return Kokkos::Experimental::cast_to_half(
-        ::exp(Kokkos::Experimental::cast_from_half<float>(x)));
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) {
-    return Kokkos::Experimental::cast_to_half(
-        ::log(Kokkos::Experimental::cast_from_half<float>(x)));
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) {
-    return Kokkos::Experimental::cast_to_half(
-        ::log10(Kokkos::Experimental::cast_from_half<float>(x)));
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type sin(const val_type x) {
-    return Kokkos::Experimental::cast_to_half(
-        ::sin(Kokkos::Experimental::cast_from_half<float>(x)));
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type cos(const val_type x) {
-    return Kokkos::Experimental::cast_to_half(
-        ::cos(Kokkos::Experimental::cast_from_half<float>(x)));
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type tan(const val_type x) {
-    return Kokkos::Experimental::cast_to_half(
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-        sycl::tan(Kokkos::Experimental::cast_from_half<float>(x))
-#else
-        ::tan(Kokkos::Experimental::cast_from_half<float>(x))
-#endif
-    );
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type sinh(const val_type x) {
-    return Kokkos::Experimental::cast_to_half(
-        ::sinh(Kokkos::Experimental::cast_from_half<float>(x)));
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type cosh(const val_type x) {
-    return Kokkos::Experimental::cast_to_half(
-        ::cosh(Kokkos::Experimental::cast_from_half<float>(x)));
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type tanh(const val_type x) {
-    return Kokkos::Experimental::cast_to_half(
-        ::tanh(Kokkos::Experimental::cast_from_half<float>(x)));
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type asin(const val_type x) {
-    return Kokkos::Experimental::cast_to_half(
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-        sycl::asin(Kokkos::Experimental::cast_from_half<float>(x))
-#else
-        ::asin(Kokkos::Experimental::cast_from_half<float>(x))
-#endif
-    );
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type acos(const val_type x) {
-    return Kokkos::Experimental::cast_to_half(
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-        sycl::acos(Kokkos::Experimental::cast_from_half<float>(x))
-#else
-        ::acos(Kokkos::Experimental::cast_from_half<float>(x))
-#endif
-    );
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type atan(const val_type x) {
-    return Kokkos::Experimental::cast_to_half(
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-        sycl::atan(Kokkos::Experimental::cast_from_half<float>(x))
-#else
-        ::atan(Kokkos::Experimental::cast_from_half<float>(x))
-#endif
-    );
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() {
-    // return ::pow(2, -KOKKOSKERNELS_IMPL_FP16_SIGNIFICAND_BITS);
-    return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_EPSILON);
-  }
-  // Backwards compatibility with Teuchos::ScalarTraits.
-  typedef mag_type magnitudeType;
-  // C++ doesn't have a standard "half-float" type.
-  typedef val_type halfPrecision;
-  typedef double doublePrecision;
-
-  static const bool isComplex            = false;
-  static const bool isOrdinal            = false;
-  static const bool isComparable         = true;
-  static const bool hasMachineParameters = true;
-  static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type x) {
-    return isNan(x) || isInf(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) {
-    return abs(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) {
-    return conj(x);
-  }
-  static std::string name() { return "half"; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) {
-    return sqrt(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type nan() {
-#ifdef __CUDA_ARCH__
-    return Kokkos::Experimental::cast_to_half(CUDART_NAN_F);
-#else
-    return Kokkos::Experimental::cast_to_half(
-        std::numeric_limits<float>::quiet_NaN());
-#endif  // __CUDA_ARCH__
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type eps() { return epsilon(); }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type sfmin() {
-    return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_MIN);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION int base() {
-    return KOKKOSKERNELS_IMPL_FP16_RADIX;
-  }
-  // Use float to allow running on both host and device
-  static KOKKOS_FORCEINLINE_FUNCTION float prec() {
-    float e = KOKKOSKERNELS_IMPL_FP16_EPSILON;
-    float b = (float)base();
-    float r = e * b;
-    return r;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION int t() {
-    return KOKKOSKERNELS_IMPL_FP16_MANT_DIG;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type rnd() {
-    return Kokkos::Experimental::cast_to_half(1.0);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION int emin() {
-    return KOKKOSKERNELS_IMPL_FP16_MIN_EXP;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type rmin() {
-    return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_MIN);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION int emax() {
-    return KOKKOSKERNELS_IMPL_FP16_MAX_EXP;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type rmax() {
-    return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_MAX);
-  }
-};
-#endif  // KOKKOS_HALF_T_IS_FLOAT && KOKKOS_ENABLE_CUDA_HALF
-
-// Since Kokkos::Experimental::bhalf_t falls back to float, only define
-// ArithTraits if bhalf_t is a backend specialization
-#if defined(KOKKOS_BHALF_T_IS_FLOAT) && !KOKKOS_BHALF_T_IS_FLOAT
-template <>
-class ArithTraits<Kokkos::Experimental::bhalf_t> {
- public:
-  typedef Kokkos::Experimental::bhalf_t val_type;
-  typedef val_type mag_type;
-
-  static const bool is_specialized = true;
-  static const bool is_signed      = true;
-  static const bool is_integer     = false;
-  static const bool is_exact       = false;
-  static const bool is_complex     = false;
-
-  static constexpr bool has_infinity = true;
-  static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() {
-    return Kokkos::Experimental::cast_to_bhalf(HUGE_VALF);
-  }
-
-  static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type x) {
-#ifndef __CUDA_ARCH__
-    using std::isinf;
-#endif
-    return isinf(Kokkos::Experimental::cast_from_bhalf<float>(x));
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type x) {
-#ifndef __CUDA_ARCH__
-    using std::isnan;
-#endif
-    return isnan(Kokkos::Experimental::cast_from_bhalf<float>(x));
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) {
-    return Kokkos::Experimental::cast_to_bhalf(
-        fabs(Kokkos::Experimental::cast_from_bhalf<float>(x)));
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type zero() {
-    return Kokkos::Experimental::cast_to_bhalf(0.0F);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type one() {
-    return Kokkos::Experimental::cast_to_bhalf(1.0F);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type min() {
-    return Kokkos::Experimental::cast_to_bhalf(-KOKKOSKERNELS_IMPL_BF16_MAX);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type max() {
-    return Kokkos::Experimental::cast_to_bhalf(KOKKOSKERNELS_IMPL_BF16_MAX);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) {
-    return x;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) {
-    return Kokkos::Experimental::cast_to_bhalf(0.0F);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) {
-    return x;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x,
-                                                  const val_type y) {
-    return Kokkos::Experimental::cast_to_bhalf(
-        ::pow(Kokkos::Experimental::cast_from_bhalf<float>(x),
-              Kokkos::Experimental::cast_from_bhalf<float>(y)));
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) {
-    return Kokkos::Experimental::cast_to_bhalf(
-        ::sqrt(Kokkos::Experimental::cast_from_bhalf<float>(x)));
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) {
-    return Kokkos::Experimental::cast_to_bhalf(
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-        sycl::cbrt(Kokkos::Experimental::cast_from_bhalf<float>(x))
-#else
-        ::cbrt(Kokkos::Experimental::cast_from_bhalf<float>(x))
-#endif
-    );
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) {
-    return Kokkos::Experimental::cast_to_bhalf(
-        ::exp(Kokkos::Experimental::cast_from_bhalf<float>(x)));
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) {
-    return Kokkos::Experimental::cast_to_bhalf(
-        ::log(Kokkos::Experimental::cast_from_bhalf<float>(x)));
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) {
-    return Kokkos::Experimental::cast_to_bhalf(
-        ::log10(Kokkos::Experimental::cast_from_bhalf<float>(x)));
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type sin(const val_type x) {
-    return Kokkos::Experimental::cast_to_bhalf(
-        ::sin(Kokkos::Experimental::cast_from_bhalf<float>(x)));
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type cos(const val_type x) {
-    return Kokkos::Experimental::cast_to_bhalf(
-        ::cos(Kokkos::Experimental::cast_from_bhalf<float>(x)));
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type tan(const val_type x) {
-    return Kokkos::Experimental::cast_to_bhalf(
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-        sycl::tan(Kokkos::Experimental::cast_from_bhalf<float>(x))
-#else
-        ::tan(Kokkos::Experimental::cast_from_bhalf<float>(x))
-#endif
-    );
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type sinh(const val_type x) {
-    return Kokkos::Experimental::cast_to_bhalf(
-        ::sinh(Kokkos::Experimental::cast_from_bhalf<float>(x)));
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type cosh(const val_type x) {
-    return Kokkos::Experimental::cast_to_bhalf(
-        ::cosh(Kokkos::Experimental::cast_from_bhalf<float>(x)));
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type tanh(const val_type x) {
-    return Kokkos::Experimental::cast_to_bhalf(
-        ::tanh(Kokkos::Experimental::cast_from_bhalf<float>(x)));
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type asin(const val_type x) {
-    return Kokkos::Experimental::cast_to_bhalf(
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-        sycl::asin(Kokkos::Experimental::cast_from_bhalf<float>(x))
-#else
-        ::asin(Kokkos::Experimental::cast_from_bhalf<float>(x))
-#endif
-    );
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type acos(const val_type x) {
-    return Kokkos::Experimental::cast_to_bhalf(
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-        sycl::acos(Kokkos::Experimental::cast_from_bhalf<float>(x))
-#else
-        ::acos(Kokkos::Experimental::cast_from_bhalf<float>(x))
-#endif
-    );
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type atan(const val_type x) {
-    return Kokkos::Experimental::cast_to_bhalf(
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-        sycl::atan(Kokkos::Experimental::cast_from_bhalf<float>(x))
-#else
-        ::atan(Kokkos::Experimental::cast_from_bhalf<float>(x))
-#endif
-    );
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() {
-    // return ::pow(2, -KOKKOSKERNELS_IMPL_BF16_SIGNIFICAND_BITS);
-    return Kokkos::Experimental::cast_to_bhalf(KOKKOSKERNELS_IMPL_BF16_EPSILON);
-  }
-  // Backwards compatibility with Teuchos::ScalarTraits.
-  typedef mag_type magnitudeType;
-  // C++ doesn't have a standard "bhalf-float" type.
-  typedef val_type bhalfPrecision;
-  typedef double doublePrecision;
-
-  static const bool isComplex            = false;
-  static const bool isOrdinal            = false;
-  static const bool isComparable         = true;
-  static const bool hasMachineParameters = true;
-  static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type x) {
-    return isNan(x) || isInf(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) {
-    return abs(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) {
-    return conj(x);
-  }
-  static std::string name() { return "bhalf"; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) {
-    return sqrt(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type nan() {
-#ifdef __CUDA_ARCH__
-    return Kokkos::Experimental::cast_to_bhalf(CUDART_NAN_F);
-#else
-    return Kokkos::Experimental::cast_to_bhalf(
-        std::numeric_limits<float>::quiet_NaN());
-#endif  // __CUDA_ARCH__
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type eps() { return epsilon(); }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type sfmin() {
-    return Kokkos::Experimental::cast_to_bhalf(KOKKOSKERNELS_IMPL_BF16_MIN);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION int base() {
-    return KOKKOSKERNELS_IMPL_BF16_RADIX;
-  }
-  // Use float to allow running on both host and device
-  static KOKKOS_FORCEINLINE_FUNCTION float prec() {
-    float e = KOKKOSKERNELS_IMPL_BF16_EPSILON;
-    float b = (float)base();
-    float r = e * b;
-    return r;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION int t() {
-    return KOKKOSKERNELS_IMPL_BF16_MANT_DIG;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type rnd() {
-    return Kokkos::Experimental::cast_to_bhalf(1.0);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION int emin() {
-    return KOKKOSKERNELS_IMPL_BF16_MIN_EXP;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type rmin() {
-    return Kokkos::Experimental::cast_to_bhalf(KOKKOSKERNELS_IMPL_BF16_MIN);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION int emax() {
-    return KOKKOSKERNELS_IMPL_BF16_MAX_EXP;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type rmax() {
-    return Kokkos::Experimental::cast_to_bhalf(KOKKOSKERNELS_IMPL_BF16_MAX);
-  }
-};
-#endif  // KOKKOS_BHALF_T_IS_FLOAT
-
-template <>
-class ArithTraits<float> {
- public:
-  typedef float val_type;
-  typedef val_type mag_type;
-
-  static const bool is_specialized = true;
-  static const bool is_signed      = true;
-  static const bool is_integer     = false;
-  static const bool is_exact       = false;
-  static const bool is_complex     = false;
-
-  static constexpr bool has_infinity = true;
-  static KOKKOS_FORCEINLINE_FUNCTION float infinity() { return HUGE_VALF; }
-
-  static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const float x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
-    using std::isinf;
-#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL)
-    using sycl::isinf;
-#endif
-    return isinf(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const float x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
-    using std::isnan;
-#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL)
-    using sycl::isnan;
-#endif
-    return isnan(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const float x) {
-    return ::fabs(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION float zero() { return 0.0; }
-  static KOKKOS_FORCEINLINE_FUNCTION float one() { return 1.0; }
-  static KOKKOS_FORCEINLINE_FUNCTION float min() { return -FLT_MAX; }
-  static KOKKOS_FORCEINLINE_FUNCTION float max() { return FLT_MAX; }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const float x) { return x; }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const float) { return 0.0; }
-  static KOKKOS_FORCEINLINE_FUNCTION float conj(const float x) { return x; }
-  static KOKKOS_FORCEINLINE_FUNCTION float pow(const float x, const float y) {
-    return ::pow(x, y);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION float sqrt(const float x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-    return sycl::sqrt(x);
-#else
-    return ::sqrt(x);
-#endif
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION float cbrt(const float x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-    return sycl::cbrt(x);
-#else
-    return ::cbrt(x);
-#endif
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION float exp(const float x) {
-    return ::exp(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION float log(const float x) {
-    return ::log(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION float log10(const float x) {
-    return ::log10(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION float sin(const float x) {
-    return ::sin(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION float cos(const float x) {
-    return ::cos(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION float tan(const float x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-    return sycl::tan(x);
-#else
-    return std::tan(x);
-#endif
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION float sinh(const float x) {
-    return ::sinh(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION float cosh(const float x) {
-    return ::cosh(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION float tanh(const float x) {
-    return ::tanh(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION float asin(const float x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-    return sycl::asin(x);
-#else
-    return ::asin(x);
-#endif
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION float acos(const float x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-    return sycl::acos(x);
-#else
-    return ::acos(x);
-#endif
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION float atan(const float x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-    return sycl::atan(x);
-#else
-    return ::atan(x);
-#endif
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return FLT_EPSILON; }
-
-  // Backwards compatibility with Teuchos::ScalarTraits.
-  typedef mag_type magnitudeType;
-  // C++ doesn't have a standard "half-float" type.
-  typedef float halfPrecision;
-  typedef double doublePrecision;
-
-  static const bool isComplex            = false;
-  static const bool isOrdinal            = false;
-  static const bool isComparable         = true;
-  static const bool hasMachineParameters = true;
-  static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const float x) {
-    return isNan(x) || isInf(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const float x) {
-    return abs(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION float conjugate(const float x) {
-    return conj(x);
-  }
-  static std::string name() { return "float"; }
-  static KOKKOS_FORCEINLINE_FUNCTION float squareroot(const float x) {
-    return sqrt(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION float nan() {
-#if defined(__CUDA_ARCH__)
-    return CUDART_NAN_F;
-    // return nan (); //this returns 0???
-#elif defined(__HIP_DEVICE_COMPILE__)
-    return ::nanf("");
-#else
-    return std::numeric_limits<float>::quiet_NaN();
-#endif  // __CUDA_ARCH__
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type eps() { return epsilon(); }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type sfmin() {
-    return FLT_MIN;  // ???
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION int base() { return FLT_RADIX; }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type prec() {
-    return eps() * static_cast<mag_type>(base());
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION int t() { return FLT_MANT_DIG; }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type rnd() { return 1.0; }
-  static KOKKOS_FORCEINLINE_FUNCTION int emin() { return FLT_MIN_EXP; }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type rmin() {
-    return FLT_MIN;  // ??? // should be base^(emin-1)
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION int emax() { return FLT_MAX_EXP; }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type rmax() {
-    return FLT_MAX;  // ??? // should be (base^emax)*(1-eps)
-  }
-};
-
-/// \brief Partial specialization for std::complex<RealFloatType>.
-///
-/// The C++ Standard Library (with C++03 at least) only allows
-/// std::complex<RealFloatType> for RealFloatType = float, double, or
-/// long double.
-template <class RealFloatType>
-class ArithTraits<std::complex<RealFloatType> > {
- public:
-  //! Kokkos internally replaces std::complex with Kokkos::complex.
-  typedef ::Kokkos::complex<RealFloatType> val_type;
-  typedef RealFloatType mag_type;
-
-  static const bool is_specialized = true;
-  static const bool is_signed      = true;
-  static const bool is_integer     = false;
-  static const bool is_exact       = false;
-  static const bool is_complex     = true;
-
-  static constexpr bool has_infinity = true;
-  static std::complex<RealFloatType> infinity() {
-    return std::complex<RealFloatType>(ArithTraits<mag_type>::infinity(),
-                                       ArithTraits<mag_type>::infinity());
-  }
-
-#ifdef KOKKOS_ENABLE_SYCL
-  template <typename Dummy = RealFloatType>
-  static bool isInf(const std::complex<Dummy>& x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
-    using std::isinf;
-#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL)
-    using sycl::isinf;
-#endif
-    return isinf(real(x)) || isinf(imag(x));
-  }
-  template <>
-  static bool isInf<long double>(const std::complex<long double>& x) {
-    Kokkos::abort("isInf not available for std::complex<long double>!\n");
-    return true;
-  }
-#else
-  static bool isInf(const std::complex<RealFloatType>& x) {
-    return Kokkos::Experimental::isinf(real(x)) ||
-           Kokkos::Experimental::isinf(imag(x));
-  }
-#endif
-#ifdef KOKKOS_ENABLE_SYCL
-  template <typename Dummy = RealFloatType>
-  static bool isNan(const std::complex<Dummy>& x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
-    using std::isnan;
-#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL)
-    using sycl::isnan;
-#endif
-    return isnan(real(x)) || isnan(imag(x));
-  }
-  template <>
-  static bool isNan<long double>(const std::complex<long double>& x) {
-    Kokkos::abort("isNan not available for std::complex<long double>!\n");
-    return true;
-  }
-#else
-  static bool isNan(const std::complex<RealFloatType>& x) {
-    return Kokkos::Experimental::isnan(real(x)) ||
-           Kokkos::Experimental::isnan(imag(x));
-  }
-#endif
-  static mag_type abs(const std::complex<RealFloatType>& x) {
-    return std::abs(x);
-  }
-  static std::complex<RealFloatType> zero() {
-    return std::complex<RealFloatType>(ArithTraits<mag_type>::zero(),
-                                       ArithTraits<mag_type>::zero());
-  }
-  static std::complex<RealFloatType> one() {
-    return std::complex<RealFloatType>(ArithTraits<mag_type>::one(),
-                                       ArithTraits<mag_type>::zero());
-  }
-  static std::complex<RealFloatType> min() {
-    return std::complex<RealFloatType>(ArithTraits<mag_type>::min(),
-                                       ArithTraits<mag_type>::zero());
-  }
-  static std::complex<RealFloatType> max() {
-    return std::complex<RealFloatType>(ArithTraits<mag_type>::max(),
-                                       ArithTraits<mag_type>::zero());
-  }
-  static mag_type real(const std::complex<RealFloatType>& x) {
-    return std::real(x);
-  }
-  static mag_type imag(const std::complex<RealFloatType>& x) {
-    return std::imag(x);
-  }
-  static std::complex<RealFloatType> conj(
-      const std::complex<RealFloatType>& x) {
-    return std::conj(x);
-  }
-  static std::complex<RealFloatType> pow(const std::complex<RealFloatType>& x,
-                                         const std::complex<RealFloatType>& y) {
-    // Fix for some weird gcc 4.2.1 inaccuracy.
-    if (y == one()) {
-      return x;
-    } else if (y == one() + one()) {
-      return x * x;
-    } else {
-      return std::pow(x, y);
-    }
-  }
-  static std::complex<RealFloatType> pow(const std::complex<RealFloatType>& x,
-                                         const RealFloatType& y) {
-    // Fix for some weird gcc 4.2.1 inaccuracy.
-    if (y == ArithTraits<RealFloatType>::one()) {
-      return x;
-    } else if (y == ArithTraits<RealFloatType>::one() +
-                        ArithTraits<RealFloatType>::one()) {
-      return x * x;
-    } else {
-      return std::pow(x, y);
-    }
-  }
-  static std::complex<RealFloatType> sqrt(
-      const std::complex<RealFloatType>& x) {
-    return std::sqrt(x);
-  }
-  static std::complex<RealFloatType> cbrt(
-      const std::complex<RealFloatType>& x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-    return sycl::cbrt(x);
-#else
-    return ::cbrt(x);
-#endif
-  }
-  static std::complex<RealFloatType> exp(const std::complex<RealFloatType>& x) {
-    return std::exp(x);
-  }
-  static std::complex<RealFloatType> log(const std::complex<RealFloatType>& x) {
-    return std::log(x);
-  }
-  static std::complex<RealFloatType> log10(
-      const std::complex<RealFloatType>& x) {
-    return std::log10(x);
-  }
-  static std::complex<RealFloatType> sin(const std::complex<RealFloatType>& x) {
-    return std::sin(x);
-  }
-  static std::complex<RealFloatType> cos(const std::complex<RealFloatType>& x) {
-    return std::cos(x);
-  }
-  static std::complex<RealFloatType> tan(const std::complex<RealFloatType>& x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-    return sycl::tan(x);
-#else
-    return std::tan(x);
-#endif
-  }
-  static std::complex<RealFloatType> sinh(
-      const std::complex<RealFloatType>& x) {
-    return std::sinh(x);
-  }
-  static std::complex<RealFloatType> cosh(
-      const std::complex<RealFloatType>& x) {
-    return std::cosh(x);
-  }
-  static std::complex<RealFloatType> tanh(
-      const std::complex<RealFloatType>& x) {
-    return std::tanh(x);
-  }
-  static std::complex<RealFloatType> asin(
-      const std::complex<RealFloatType>& x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-    return sycl::asin(x);
-#else
-    return ::asin(x);
-#endif
-  }
-  static std::complex<RealFloatType> acos(
-      const std::complex<RealFloatType>& x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-    return sycl::acos(x);
-#else
-    return ::acos(x);
-#endif
-  }
-  static std::complex<RealFloatType> atan(
-      const std::complex<RealFloatType>& x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-    using sycl::atan;
-#else
-    using std::atan;
-#endif
-    return atan(x);
-  }
-  static std::complex<RealFloatType> nan() {
-    const mag_type mag_nan = ArithTraits<mag_type>::nan();
-    return std::complex<RealFloatType>(mag_nan, mag_nan);
-  }
-  static mag_type epsilon() { return ArithTraits<mag_type>::epsilon(); }
-
-  // Backwards compatibility with Teuchos::ScalarTraits.
-  typedef mag_type magnitudeType;
-  typedef std::complex<typename ArithTraits<mag_type>::halfPrecision>
-      halfPrecision;
-  typedef std::complex<typename ArithTraits<mag_type>::doublePrecision>
-      doublePrecision;
-
-  static const bool isComplex            = true;
-  static const bool isOrdinal            = false;
-  static const bool isComparable         = false;
-  static const bool hasMachineParameters = true;
-  static bool isnaninf(const std::complex<RealFloatType>& x) {
-    return isNan(x) || isInf(x);
-  }
-  static mag_type magnitude(const std::complex<RealFloatType>& x) {
-    return abs(x);
-  }
-  static std::complex<RealFloatType> conjugate(
-      const std::complex<RealFloatType>& x) {
-    return conj(x);
-  }
-  static std::string name() {
-    return std::string("std::complex<") + ArithTraits<mag_type>::name() + ">";
-  }
-  static std::complex<RealFloatType> squareroot(
-      const std::complex<RealFloatType>& x) {
-    return sqrt(x);
-  }
-  static mag_type eps() { return epsilon(); }
-  static mag_type sfmin() { return ArithTraits<mag_type>::sfmin(); }
-  static int base() { return ArithTraits<mag_type>::base(); }
-  static mag_type prec() { return ArithTraits<mag_type>::prec(); }
-  static int t() { return ArithTraits<mag_type>::t(); }
-  static mag_type rnd() { return ArithTraits<mag_type>::one(); }
-  static int emin() { return ArithTraits<mag_type>::emin(); }
-  static mag_type rmin() { return ArithTraits<mag_type>::rmin(); }
-  static int emax() { return ArithTraits<mag_type>::emax(); }
-  static mag_type rmax() { return ArithTraits<mag_type>::rmax(); }
-};
-
-template <>
-class ArithTraits<double> {
- public:
-  typedef double val_type;
-  typedef val_type mag_type;
-
-  static const bool is_specialized = true;
-  static const bool is_signed      = true;
-  static const bool is_integer     = false;
-  static const bool is_exact       = false;
-  static const bool is_complex     = false;
-
-  static constexpr bool has_infinity = true;
-  static KOKKOS_FORCEINLINE_FUNCTION double infinity() { return HUGE_VAL; }
-
-  static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
-    using std::isinf;
-#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL)
-    using sycl::isinf;
-#endif
-    return isinf(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
-    using std::isnan;
-#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL)
-    using sycl::isnan;
-#endif
-    return isnan(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) {
-    return ::fabs(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return 0.0; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type one() { return 1.0; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type min() { return -DBL_MAX; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type max() { return DBL_MAX; }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) {
-    return x;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) {
-    return 0.0;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) {
-    return x;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x,
-                                                  const val_type y) {
-    return ::pow(x, y);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-    return sycl::sqrt(x);
-#else
-    return ::sqrt(x);
-#endif
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-    return sycl::cbrt(x);
-#else
-    return ::cbrt(x);
-#endif
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) {
-    return ::exp(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) {
-    return ::log(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) {
-    return ::log10(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type sin(const val_type x) {
-    return ::sin(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type cos(const val_type x) {
-    return ::cos(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type tan(const val_type x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-    return sycl::tan(x);
-#else
-    return std::tan(x);
-#endif
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type sinh(const val_type x) {
-    return ::sinh(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type cosh(const val_type x) {
-    return ::cosh(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type tanh(const val_type x) {
-    return ::tanh(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type asin(const val_type x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-    return sycl::asin(x);
-#else
-    return ::asin(x);
-#endif
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type acos(const val_type x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-    return sycl::acos(x);
-#else
-    return ::acos(x);
-#endif
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type atan(const val_type x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-    return sycl::atan(x);
-#else
-    return ::atan(x);
-#endif
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type nan() {
-#if defined(__CUDA_ARCH__)
-    return CUDART_NAN;
-    // return nan (); // this returns 0 ???
-#elif defined(__HIP_DEVICE_COMPILE__)
-    return ::nan("");
-#else
-    return std::numeric_limits<val_type>::quiet_NaN();
-#endif  // __CUDA_ARCH__
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return DBL_EPSILON; }
-
-  // Backwards compatibility with Teuchos::ScalarTraits.
-  typedef mag_type magnitudeType;
-  typedef float halfPrecision;
-#if defined(__CUDA_ARCH__)
-  typedef double
-      doublePrecision;  // CUDA doesn't support long double, unfortunately
-#elif defined(__HIP_DEVICE_COMPILE__)
-  typedef double
-      doublePrecision;  // HIP does not support long double unfortunately
-#else
-  typedef long double doublePrecision;
-#endif  // __CUDA_ARCH__
-  static const bool isComplex            = false;
-  static const bool isOrdinal            = false;
-  static const bool isComparable         = true;
-  static const bool hasMachineParameters = true;
-  static bool isnaninf(const val_type& x) { return isNan(x) || isInf(x); }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type magnitude(const val_type x) {
-    return abs(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) {
-    return conj(x);
-  }
-  static std::string name() { return "double"; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) {
-    return sqrt(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type eps() { return epsilon(); }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type sfmin() {
-    return DBL_MIN;  // ???
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION int base() {
-    return FLT_RADIX;  // same for float as for double
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type prec() {
-    return eps() * static_cast<mag_type>(base());
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION int t() { return DBL_MANT_DIG; }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type rnd() { return 1.0; }
-  static KOKKOS_FORCEINLINE_FUNCTION int emin() { return DBL_MIN_EXP; }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type rmin() {
-    return DBL_MIN;  // ??? // should be base^(emin-1)
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION int emax() { return DBL_MAX_EXP; }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type rmax() {
-    return DBL_MAX;  // ??? // should be (base^emax)*(1-eps)
-  }
-};
-
-// CUDA and HIP do not support long double in device functions,
-// so none of the class methods in this specialization are marked
-// as device functions.
-template <>
-class ArithTraits<long double> {
- public:
-  typedef long double val_type;
-  typedef long double mag_type;
-
-  static const bool is_specialized = true;
-  static const bool is_signed      = true;
-  static const bool is_integer     = false;
-  static const bool is_exact       = false;
-  static const bool is_complex     = false;
-
-  static constexpr bool has_infinity = true;
-  static long double infinity() { return HUGE_VALL; }
-
-  static bool isInf(const val_type& x) {
-    using std::isinf;
-    return isinf(x);
-  }
-  static bool isNan(const val_type& x) {
-    using std::isnan;
-    return isnan(x);
-  }
-  static mag_type abs(const val_type& x) { return ::fabsl(x); }
-  static val_type zero() { return 0.0; }
-  static val_type one() { return 1.0; }
-  static val_type min() { return -LDBL_MAX; }
-  static val_type max() { return LDBL_MAX; }
-  static mag_type real(const val_type& x) { return x; }
-  static mag_type imag(const val_type&) { return zero(); }
-  static val_type conj(const val_type& x) { return x; }
-  static val_type pow(const val_type& x, const val_type& y) {
-    return ::pow(x, y);
-  }
-  static val_type sqrt(const val_type& x) { return ::sqrt(x); }
-  static val_type cbrt(const val_type& x) { return ::cbrtl(x); }
-  static val_type exp(const val_type& x) { return ::exp(x); }
-  static val_type log(const val_type& x) { return ::log(x); }
-  static val_type log10(const val_type& x) { return ::log10(x); }
-  static val_type sin(const val_type& x) { return ::sin(x); }
-  static val_type cos(const val_type& x) { return ::cos(x); }
-  static val_type tan(const val_type& x) { return ::tan(x); }
-  static val_type sinh(const val_type& x) { return ::sinh(x); }
-  static val_type cosh(const val_type& x) { return ::cosh(x); }
-  static val_type tanh(const val_type& x) { return ::tanh(x); }
-  static val_type asin(const val_type& x) { return ::asin(x); }
-  static val_type acos(const val_type& x) { return ::acos(x); }
-  static val_type atan(const val_type& x) { return ::atan(x); }
-  static val_type nan() { return std::numeric_limits<val_type>::quiet_NaN(); }
-  static mag_type epsilon() { return LDBL_EPSILON; }
-
-  // Backwards compatibility with Teuchos::ScalarTraits.
-  typedef mag_type magnitudeType;
-  typedef double halfPrecision;
-  // It might be appropriate to use QD's qd_real here.
-  // For now, long double is the most you get.
-  typedef val_type doublePrecision;
-
-  static const bool isComplex            = false;
-  static const bool isOrdinal            = false;
-  static const bool isComparable         = true;
-  static const bool hasMachineParameters = true;
-  static bool isnaninf(const val_type& x) { return isNan(x) || isInf(x); }
-  static mag_type magnitude(const val_type& x) { return abs(x); }
-  static val_type conjugate(const val_type& x) { return conj(x); }
-  static std::string name() { return "long double"; }
-  static val_type squareroot(const val_type& x) { return sqrt(x); }
-  static mag_type eps() { return epsilon(); }
-  static mag_type sfmin() {
-    return LDBL_MIN;  // ???
-  }
-  static int base() {
-    return FLT_RADIX;  // same for float as for double or long double
-  }
-  static mag_type prec() { return eps() * static_cast<mag_type>(base()); }
-  static int t() { return LDBL_MANT_DIG; }
-  static mag_type rnd() { return one(); }
-  static int emin() { return LDBL_MIN_EXP; }
-  static mag_type rmin() { return LDBL_MIN; }
-  static int emax() { return LDBL_MAX_EXP; }
-  static mag_type rmax() { return LDBL_MAX; }
-};  // long double specialization
-
-#ifdef HAVE_KOKKOSKERNELS_QUADMATH
-
-// CUDA does not support __float128 in device functions, so none of
-// the class methods in this specialization are marked as device
-// functions.
-template <>
-class ArithTraits<__float128> {
- public:
-  typedef __float128 val_type;
-  typedef val_type mag_type;
-
-  static const bool is_specialized = true;
-  static const bool is_signed      = true;
-  static const bool is_integer     = false;
-  static const bool is_exact       = false;
-  static const bool is_complex     = false;
-
-  static constexpr bool has_infinity = true;
-  static __float128 infinity() { return 1.0q / 0.0q; }
-
-  static bool isInf(const __float128 x) { return isinfq(x); }
-  static bool isNan(const __float128 x) { return isnanq(x); }
-  static mag_type abs(const __float128 x) { return fabsq(x); }
-  static __float128 zero() { return 0.0; }
-  static __float128 one() { return 1.0; }
-  static __float128 min() { return FLT128_MIN; }
-  static __float128 max() { return FLT128_MAX; }
-  static mag_type real(const __float128 x) { return x; }
-  static mag_type imag(const __float128 /* x */) { return 0.0; }
-  static __float128 conj(const __float128 x) { return x; }
-  static __float128 pow(const __float128 x, const __float128 y) {
-    return powq(x, y);
-  }
-  static __float128 sqrt(const __float128 x) { return sqrtq(x); }
-  static __float128 cbrt(const __float128 x) { return cbrtq(x); }
-  static __float128 exp(const __float128 x) { return exp(x); }
-  static __float128 log(const __float128 x) { return logq(x); }
-  static __float128 log10(const __float128 x) { return log10q(x); }
-  static __float128 sin(const __float128 x) { return sinq(x); }
-  static __float128 cos(const __float128 x) { return cosq(x); }
-  static __float128 tan(const __float128 x) { return tanq(x); }
-  static __float128 sinh(const __float128 x) { return sinhq(x); }
-  static __float128 cosh(const __float128 x) { return coshq(x); }
-  static __float128 tanh(const __float128 x) { return tanhq(x); }
-  static __float128 asin(const __float128 x) { return asinq(x); }
-  static __float128 acos(const __float128 x) { return acosq(x); }
-  static __float128 atan(const __float128 x) { return atanq(x); }
-  static mag_type epsilon() { return FLT128_EPSILON; }
-
-  // Backwards compatibility with Teuchos::ScalarTraits.
-  typedef mag_type magnitudeType;
-  typedef double halfPrecision;
-  // Unfortunately, we can't rely on a standard __float256 type.
-  typedef __float128 doublePrecision;
-
-  static const bool isComplex            = false;
-  static const bool isOrdinal            = false;
-  static const bool isComparable         = true;
-  static const bool hasMachineParameters = true;
-  static bool isnaninf(const __float128 x) { return isNan(x) || isInf(x); }
-  static magnitudeType magnitude(const __float128 x) { return abs(x); }
-  static __float128 conjugate(const __float128 x) { return conj(x); }
-  static std::string name() { return "__float128"; }
-  static __float128 squareroot(const __float128 x) { return sqrt(x); }
-  static __float128 nan() {
-    return strtoflt128("NAN()", NULL);  // ???
-  }
-  static mag_type eps() { return epsilon(); }
-  static mag_type sfmin() {
-    return FLT128_MIN;  // ???
-  }
-  static int base() { return 2; }
-  static mag_type prec() { return eps() * static_cast<mag_type>(base()); }
-  static int t() { return FLT_MANT_DIG; }
-  static mag_type rnd() { return 1.0; }
-  static int emin() { return FLT128_MIN_EXP; }
-  static mag_type rmin() {
-    return FLT128_MIN;  // ??? // should be base^(emin-1)
-  }
-  static int emax() { return FLT128_MAX_EXP; }
-  static mag_type rmax() {
-    return FLT128_MAX;  // ??? // should be (base^emax)*(1-eps)
-  }
-};
-#endif  // HAVE_KOKKOSKERNELS_QUADMATH
-
-template <>
-class ArithTraits< ::Kokkos::complex<float> > {
- public:
-  typedef ::Kokkos::complex<float> val_type;
-  typedef float mag_type;
-
-  static const bool is_specialized = true;
-  static const bool is_signed      = true;
-  static const bool is_integer     = false;
-  static const bool is_exact       = false;
-  static const bool is_complex     = true;
-
-  static constexpr bool has_infinity = true;
-  static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() {
-    return val_type(ArithTraits<mag_type>::infinity(),
-                    ArithTraits<mag_type>::infinity());
-  }
-
-  static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type x) {
-    return ArithTraits<mag_type>::isInf(x.real()) ||
-           ArithTraits<mag_type>::isInf(x.imag());
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type x) {
-    return ArithTraits<mag_type>::isNan(x.real()) ||
-           ArithTraits<mag_type>::isNan(x.imag());
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) {
-    return std::sqrt(::Kokkos::real(x) * ::Kokkos::real(x) +
-                     ::Kokkos::imag(x) * ::Kokkos::imag(x));
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type zero() {
-    return val_type(ArithTraits<mag_type>::zero(),
-                    ArithTraits<mag_type>::zero());
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type one() {
-    return val_type(ArithTraits<mag_type>::one(),
-                    ArithTraits<mag_type>::zero());
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type min() {
-    return val_type(ArithTraits<mag_type>::min(),
-                    ArithTraits<mag_type>::min());  // ???
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type max() {
-    return val_type(ArithTraits<mag_type>::max(),
-                    ArithTraits<mag_type>::max());  // ???
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) {
-    return x.real();
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type x) {
-    return x.imag();
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) {
-    return ::Kokkos::conj(x);
-  }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type pow (const val_type x, const
-  // val_type y) {
-  //   const mag_type abs_x_square = x.real()*x.real() + x.imag()*x.imag();
-  //   const mag_type arg_x = ArithTraits<mag_type>::atan(x.imag()/x.real());
-  //   const mag_type half = mag_type(0.5);
-  //   const mag_type alpha = (ArithTraits<mag_type>::pow(abs_x_square,
-  //   half*y.real()) *
-  //                           ArithTraits<mag_type>::exp(-y.imag()*arg_x));
-  //   return val_type(alpha* ArithTraits<mag_type>::cos(y.real()*arg_x +
-  //   half*y.imag()*ArithTraits<mag_type>::log(abs_x_square)),
-  //                   alpha* ArithTraits<mag_type>::sin(y.real()*arg_x +
-  //                   half*y.imag()*ArithTraits<mag_type>::log(abs_x_square)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type pow (const val_type x, const
-  // mag_type y) {
-  //   const mag_type arg_x = ArithTraits<mag_type>::atan(x.imag()/x.real());
-  //   const mag_type alpha = ArithTraits<mag_type>::pow(abs(x),y);
-  //   return val_type(alpha* ArithTraits<mag_type>::cos(y*arg_x),
-  //                   alpha* ArithTraits<mag_type>::sin(y*arg_x));
-  // }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) {
-    return ::Kokkos::sqrt(x);
-  }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt (const val_type x) {
-  //   const mag_type r = ::Kokkos::abs(x);
-  //   const mag_type phi = ::atan(x.imag()/x.real())/mag_type(3);
-  //   const mag_type re = r* ::cos(phi);
-  //   const mag_type im = r* ::sin(phi);
-  //   return val_type(re,im);
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type exp (const val_type x) {
-  //   const mag_type xx = ::exp(x.real());
-  //   const mag_type re = xx* ::cos(x.imag());
-  //   const mag_type im = xx* ::sin(x.imag());
-  //   return val_type(re,im);
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type log (const val_type x) {
-  //   return val_type(ArithTraits<mag_type>::log(abs(x)),
-  //   ArithTraits<mag_type>::atan(x.imag()/x.real()));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type log10 (const val_type x) {
-  //   return log(x)/ArithTraits<mag_type>::log(mag_type(10));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) {
-  //   const val_type ii = val_type(0, 1);
-  //   const val_type xx = exp(-ii*x) - exp(ii*x);
-  //   const mag_type half = 0.5;
-  //   return val_type(-half*xx.imag(),half*xx.real());
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) {
-  //   const val_type xx = exp(x) - exp(-x);
-  //   const mag_type half = 0.5;
-  //   return val_type(half*xx.real(),half*xx.imag());
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) {
-  //   const val_type ii(0, 1);
-  //   const val_type e_nix = exp(-ii*x);
-  //   const val_type e_pix = exp( ii*x);
-  //   return ii*(e_nix - e_pix)/(e_nix + e_pix);
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) {
-  //   const val_type xx = exp(x) - exp(-x);
-  //   const mag_type half = 0.5;
-  //   return val_type(half*xx.real(), half*xx.imag());
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) {
-  //   const val_type xx = exp(x) + exp(-x);
-  //   const mag_type half = 0.5;
-  //   return val_type(half*xx.real(), half*xx.imag());
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) {
-  //   const val_type e_2x = exp(2*x);
-  //   return (e_2x - 1)/(e_2x + 1);
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) {
-  //   const val_type ii(0, 1);
-  //   const val_type xx = -ii*log(ii*x + sqrt(val_type(1) - x*x));
-  //   return val_type(xx.imag(),-xx.real());
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) {
-  //   const val_type ii = val_type(0, 1);
-  //   const val_type xx = -ii*log(x + ii*sqrt(val_type(1) - x*x));
-  //   return val_type(xx.imag(),-xx.real());
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) {
-  //   val_type r_val;
-  //   const val_type ii = val_type(0, 1);
-  //   if (x == ii) {
-  //     r_val = val_type(ArithTraits<mag_type>::nan(),
-  //     std::numeric_limits<mag_type>::infinity());
-  //   } if (x == -ii) {
-  //     r_val = val_type(ArithTraits<mag_type>::nan(),
-  //     -std::numeric_limits<mag_type>::infinity());
-  //   } else {
-  //     const val_type ii_x = ii*x;
-  //     const mag_type half = 0.5;
-  //     const val_type xx = log(val_type(1) - ii_x) - log(val_type(1) + ii_x);
-  //     r_val = val_type(-half*xx.imag(), half*xx.real());
-  //   }
-  //   return r_val;
-  // }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type nan() {
-    // ???
-    return val_type(ArithTraits<mag_type>::nan(), ArithTraits<mag_type>::nan());
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() {
-    return ArithTraits<mag_type>::epsilon();  // ???
-  }
-
-  // Backwards compatibility with Teuchos::ScalarTraits.
-  typedef mag_type magnitudeType;
-  typedef ::Kokkos::complex<ArithTraits<mag_type>::halfPrecision> halfPrecision;
-  typedef ::Kokkos::complex<ArithTraits<mag_type>::doublePrecision>
-      doublePrecision;
-
-  static const bool isComplex    = true;
-  static const bool isOrdinal    = false;
-  static const bool isComparable = false;
-  static const bool hasMachineParameters =
-      ArithTraits<mag_type>::hasMachineParameters;
-  static bool isnaninf(const val_type& x) { return isNan(x) || isInf(x); }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type magnitude(const val_type x) {
-    return abs(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) {
-    return conj(x);
-  }
-  static std::string name() { return "Kokkos::complex<float>"; }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot (const val_type x) {
-  //   return sqrt (x);
-  // }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type eps() { return epsilon(); }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type sfmin() {
-    return ArithTraits<mag_type>::sfmin();  // ???
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION int base() {
-    return ArithTraits<mag_type>::base();
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type prec() {
-    return ArithTraits<mag_type>::prec();  // ???
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION int t() {
-    return ArithTraits<mag_type>::t();
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type rnd() {
-    return ArithTraits<mag_type>::rnd();
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION int emin() {
-    return ArithTraits<mag_type>::emin();
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type rmin() {
-    return ArithTraits<mag_type>::rmin();
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION int emax() {
-    return ArithTraits<mag_type>::emax();
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type rmax() {
-    return ArithTraits<mag_type>::rmax();
-  }
-};
-
-template <>
-class ArithTraits< ::Kokkos::complex<double> > {
- public:
-  typedef ::Kokkos::complex<double> val_type;
-  typedef double mag_type;
-
-  static const bool is_specialized = true;
-  static const bool is_signed      = true;
-  static const bool is_integer     = false;
-  static const bool is_exact       = false;
-  static const bool is_complex     = true;
-
-  static constexpr bool has_infinity = true;
-  static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() {
-    return val_type(ArithTraits<mag_type>::infinity(),
-                    ArithTraits<mag_type>::infinity());
-  }
-
-  static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type x) {
-    return ArithTraits<mag_type>::isInf(x.real()) ||
-           ArithTraits<mag_type>::isInf(x.imag());
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type x) {
-    return ArithTraits<mag_type>::isNan(x.real()) ||
-           ArithTraits<mag_type>::isNan(x.imag());
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) {
-    return ::Kokkos::abs(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type zero() {
-    return val_type(ArithTraits<mag_type>::zero(),
-                    ArithTraits<mag_type>::zero());
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type one() {
-    return val_type(ArithTraits<mag_type>::one(),
-                    ArithTraits<mag_type>::zero());
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type min() {
-    return val_type(ArithTraits<mag_type>::min(),
-                    ArithTraits<mag_type>::min());  // ???
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type max() {
-    return val_type(ArithTraits<mag_type>::max(),
-                    ArithTraits<mag_type>::max());  // ???
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) {
-    return x.real();
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type x) {
-    return x.imag();
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) {
-    return ::Kokkos::conj(x);
-  }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type pow (const val_type x, const
-  // val_type y) {
-  //   const mag_type abs_x_square = x.real()*x.real() + x.imag()*x.imag();
-  //   const mag_type arg_x = ArithTraits<mag_type>::atan(x.imag()/x.real());
-  //   const mag_type half = mag_type(0.5);
-  //   const mag_type alpha = (ArithTraits<mag_type>::pow(abs_x_square,
-  //   half*y.real()) *
-  //                           ArithTraits<mag_type>::exp(-y.imag()*arg_x));
-  //   return val_type(alpha* ArithTraits<mag_type>::cos(y.real()*arg_x +
-  //   half*y.imag()*ArithTraits<mag_type>::log(abs_x_square)),
-  //                   alpha* ArithTraits<mag_type>::sin(y.real()*arg_x +
-  //                   half*y.imag()*ArithTraits<mag_type>::log(abs_x_square)));
-
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type pow (const val_type x, const
-  // mag_type y) {
-  //   const mag_type arg_x = ArithTraits<mag_type>::atan(x.imag()/x.real());
-  //   const mag_type alpha = ArithTraits<mag_type>::pow(abs(x),y);
-  //   return val_type(alpha* ArithTraits<mag_type>::cos(y*arg_x),
-  //                   alpha* ArithTraits<mag_type>::sin(y*arg_x));
-  // }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) {
-    return ::Kokkos::sqrt(x);
-  }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt (const val_type x) {
-  //   const mag_type r = ::Kokkos::abs(x);
-  //   const mag_type phi = ::atan(x.imag()/x.real())/mag_type(3);
-  //   const mag_type re = r* ::cos(phi);
-  //   const mag_type im = r* ::sin(phi);
-  //   return val_type(re,im);
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type exp (const val_type x) {
-  //   const mag_type xx = ::exp(x.real());
-  //   const mag_type re = xx* ::cos(x.imag());
-  //   const mag_type im = xx* ::sin(x.imag());
-  //   return val_type(re,im);
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type log (const val_type x) {
-  //   return val_type(ArithTraits<mag_type>::log(abs(x)),
-  //   ArithTraits<mag_type>::atan(x.imag()/x.real()));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type log10 (const val_type x) {
-  //   return log(x)/ArithTraits<mag_type>::log(mag_type(10));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) {
-  //   const val_type ii = val_type(0, 1);
-  //   const val_type xx = exp(-ii*x) - exp(ii*x);
-  //   const mag_type half = 0.5;
-  //   return val_type(-half*xx.imag(),half*xx.real());
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) {
-  //   const val_type xx = exp(x) - exp(-x);
-  //   const mag_type half = 0.5;
-  //   return val_type(half*xx.real(),half*xx.imag());
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) {
-  //   const val_type ii = val_type(0, 1);
-  //   const val_type e_nix = exp(-ii*x);
-  //   const val_type e_pix = exp( ii*x);
-  //   return ii*(e_nix - e_pix)/(e_nix + e_pix);
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) {
-  //   const val_type xx = exp(x) - exp(-x);
-  //   const mag_type half = 0.5;
-  //   return val_type(half*xx.real(), half*xx.imag());
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) {
-  //   const val_type xx = exp(x) + exp(-x);
-  //   const mag_type half = 0.5;
-  //   return val_type(half*xx.real(), half*xx.imag());
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) {
-  //   const val_type e_2x = exp(2*x);
-  //   return (e_2x - 1)/(e_2x + 1);
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) {
-  //   const val_type ii(0, 1);
-  //   const val_type xx = -ii*log(ii*x + sqrt(val_type(1) - x*x));
-  //   return val_type(xx.imag(),-xx.real());
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) {
-  //   const val_type ii = val_type(0, 1);
-  //   const val_type xx = -ii*log(x + ii*sqrt(val_type(1) - x*x));
-  //   return val_type(xx.imag(),-xx.real());
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) {
-  //   val_type r_val;
-  //   const val_type ii = val_type(0, 1);
-  //   if (x == ii) {
-  //     r_val = val_type(ArithTraits<mag_type>::nan(),
-  //     std::numeric_limits<mag_type>::infinity());
-  //   } if (x == -ii) {
-  //     r_val = val_type(ArithTraits<mag_type>::nan(),
-  //     -std::numeric_limits<mag_type>::infinity());
-  //   } else {
-  //     const val_type ii_x = ii*x;
-  //     const mag_type half = 0.5;
-  //     const val_type xx = log(val_type(1) - ii_x) - log(val_type(1) + ii_x);
-  //     r_val = val_type(-half*xx.imag(), half*xx.real());
-  //   }
-  //   return r_val;
-  // }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type nan() {
-    // ???
-    return val_type(ArithTraits<mag_type>::nan(), ArithTraits<mag_type>::nan());
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() {
-    return ArithTraits<mag_type>::epsilon();  // ???
-  }
-
-  // Backwards compatibility with Teuchos::ScalarTraits.
-  typedef mag_type magnitudeType;
-  typedef ::Kokkos::complex<ArithTraits<mag_type>::halfPrecision> halfPrecision;
-  typedef ::Kokkos::complex<ArithTraits<mag_type>::doublePrecision>
-      doublePrecision;
-
-  static const bool isComplex    = true;
-  static const bool isOrdinal    = false;
-  static const bool isComparable = false;
-  static const bool hasMachineParameters =
-      ArithTraits<mag_type>::hasMachineParameters;
-  static bool isnaninf(const val_type& x) { return isNan(x) || isInf(x); }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type magnitude(const val_type x) {
-    return abs(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) {
-    return conj(x);
-  }
-  static std::string name() { return "Kokkos::complex<double>"; }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot (const val_type x) {
-  //   return sqrt (x);
-  // }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type eps() { return epsilon(); }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type sfmin() {
-    return ArithTraits<mag_type>::sfmin();  // ???
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION int base() {
-    return ArithTraits<mag_type>::base();
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type prec() {
-    return ArithTraits<mag_type>::prec();  // ???
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION int t() {
-    return ArithTraits<mag_type>::t();
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type rnd() {
-    return ArithTraits<mag_type>::rnd();
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION int emin() {
-    return ArithTraits<mag_type>::emin();
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type rmin() {
-    return ArithTraits<mag_type>::rmin();
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION int emax() {
-    return ArithTraits<mag_type>::emax();
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type rmax() {
-    return ArithTraits<mag_type>::rmax();
-  }
-};
-
-template <>
-class ArithTraits<char> {
- public:
-  typedef char val_type;
-  typedef val_type mag_type;
-
-  static const bool is_specialized = true;
-  // The C(++) standard does not require that char be signed.  In
-  // fact, signed char, unsigned char, and char are distinct types.
-  // We can use std::numeric_limits here because it's a const bool,
-  // not a class method.
-  static const bool is_signed  = std::numeric_limits<char>::is_signed;
-  static const bool is_integer = true;
-  static const bool is_exact   = true;
-  static const bool is_complex = false;
-
-  static constexpr bool has_infinity = false;
-  static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { return 0; }
-
-  static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) {
-    return false;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type) {
-    return false;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) {
-    // This avoids warnings based on whether char is signed or unsigned
-    return integer_abs<char>::abs(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return 0; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type one() { return 1; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type min() { return CHAR_MIN; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type max() { return CHAR_MAX; }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) {
-    return x;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { return 0; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) {
-    return x;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x,
-                                                  const val_type y) {
-    if (is_signed) {
-      return intPowSigned<val_type>(x, y);
-    } else {
-      return intPowUnsigned<val_type>(x, y);
-    }
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) {
-    // C++11 defines std::sqrt for integer arguments.  However, we
-    // currently can't assume C++11.
-    //
-    // This cast will result in no loss of accuracy, though it might
-    // be more expensive than it should, if we were clever about using
-    // bit operations.
-    //
-    // We take the absolute value first to avoid negative arguments.
-    // Negative real arguments to sqrt(float) return (float) NaN, but
-    // built-in integer types do not have an equivalent to NaN.
-    // Casting NaN to an integer type will thus result in some integer
-    // value which appears valid, but is not.  We cannot raise an
-    // exception in device functions.  Thus, we prefer to take the
-    // absolute value of x first, to avoid issues.  Another
-    // possibility would be to test for a NaN output and convert it to
-    // some reasonable value (like 0), though this might be more
-    // expensive than the absolute value interpreted using the ternary
-    // operator.
-    return static_cast<val_type>(
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-        sycl::sqrt(static_cast<float>(abs(x)))
-#else
-        ::sqrt(static_cast<float>(abs(x)))
-#endif
-    );
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) {
-    return static_cast<val_type>(
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-        sycl::cbrt(static_cast<float>(abs(x)))
-#else
-        ::cbrt(static_cast<float>(abs(x)))
-#endif
-    );
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) {
-    return static_cast<val_type>(::exp(static_cast<float>(abs(x))));
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) {
-    return static_cast<val_type>(::log(static_cast<float>(abs(x))));
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) {
-    return static_cast<val_type>(::log10(static_cast<float>(abs(x))));
-  }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) {
-  //   return static_cast<val_type> ( ::sin (static_cast<float> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) {
-  //   return static_cast<val_type> ( ::cos (static_cast<float> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) {
-  //   return static_cast<val_type> ( ::tan (static_cast<float> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) {
-  //   return static_cast<val_type> ( ::sinh (static_cast<float> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) {
-  //   return static_cast<val_type> ( ::cosh (static_cast<float> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) {
-  //   return static_cast<val_type> ( ::tanh (static_cast<float> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) {
-  //   return static_cast<val_type> ( ::asin (static_cast<float> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) {
-  //   return static_cast<val_type> ( ::acos (static_cast<float> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) {
-  //   return static_cast<val_type> ( ::atan (static_cast<float> (x)));
-  // }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); }
-
-  // Backwards compatibility with Teuchos::ScalarTraits.
-  typedef mag_type magnitudeType;
-  typedef val_type halfPrecision;
-  typedef val_type doublePrecision;
-
-  static const bool isComplex            = false;
-  static const bool isOrdinal            = true;
-  static const bool isComparable         = true;
-  static const bool hasMachineParameters = false;
-  static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) {
-    return abs(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) {
-    return conj(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type) {
-    return false;
-  }
-  static std::string name() { return "char"; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) {
-    return sqrt(x);
-  }
-};
-
-template <>
-class ArithTraits<signed char> {
- public:
-  typedef signed char val_type;
-  typedef val_type mag_type;
-
-  static const bool is_specialized = true;
-  static const bool is_signed      = true;
-  static const bool is_integer     = true;
-  static const bool is_exact       = true;
-  static const bool is_complex     = false;
-
-  static constexpr bool has_infinity = false;
-  static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { return 0; }
-
-  static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) {
-    return false;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type) {
-    return false;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) {
-    return x >= 0 ? x : -x;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return 0; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type one() { return 1; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type min() { return SCHAR_MIN; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type max() { return SCHAR_MAX; }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) {
-    return x;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { return 0; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) {
-    return x;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x,
-                                                  const val_type y) {
-    return intPowSigned<val_type>(x, y);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) {
-    return static_cast<val_type>(
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-        sycl::sqrt(static_cast<float>(abs(x)))
-#else
-        ::sqrt(static_cast<float>(abs(x)))
-#endif
-    );
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) {
-    return static_cast<val_type>(
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-        sycl::cbrt(static_cast<float>(abs(x)))
-#else
-        ::cbrt(static_cast<float>(abs(x)))
-#endif
-    );
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) {
-    return static_cast<val_type>(::exp(static_cast<float>(abs(x))));
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) {
-    return static_cast<val_type>(::log(static_cast<float>(abs(x))));
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) {
-    return static_cast<val_type>(::log10(static_cast<float>(abs(x))));
-  }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) {
-  //   return static_cast<val_type> ( ::sin (static_cast<float> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) {
-  //   return static_cast<val_type> ( ::cos (static_cast<float> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) {
-  //   return static_cast<val_type> ( ::tan (static_cast<float> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) {
-  //   return static_cast<val_type> ( ::sinh (static_cast<float> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) {
-  //   return static_cast<val_type> ( ::cosh (static_cast<float> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) {
-  //   return static_cast<val_type> ( ::tanh (static_cast<float> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) {
-  //   return static_cast<val_type> ( ::asin (static_cast<float> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) {
-  //   return static_cast<val_type> ( ::acos (static_cast<float> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) {
-  //   return static_cast<val_type> ( ::atan (static_cast<float> (x)));
-  // }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); }
-
-  // Backwards compatibility with Teuchos::ScalarTraits.
-  typedef mag_type magnitudeType;
-  typedef val_type halfPrecision;
-  typedef val_type doublePrecision;
-
-  static const bool isComplex            = false;
-  static const bool isOrdinal            = true;
-  static const bool isComparable         = true;
-  static const bool hasMachineParameters = false;
-  static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) {
-    return abs(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) {
-    return conj(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type) {
-    return false;
-  }
-  static std::string name() { return "signed char"; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) {
-    return sqrt(x);
-  }
-};
-
-template <>
-class ArithTraits<unsigned char> {
- public:
-  typedef unsigned char val_type;
-  typedef val_type mag_type;
-
-  static const bool is_specialized = true;
-  static const bool is_signed      = false;
-  static const bool is_integer     = true;
-  static const bool is_exact       = true;
-  static const bool is_complex     = false;
-
-  static constexpr bool has_infinity = false;
-  static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { return 0; }
-
-  static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) {
-    return false;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type) {
-    return false;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) {
-    return x;  // it's unsigned, so it's positive
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return 0; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type one() { return 1; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type min() { return 0; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type max() { return UCHAR_MAX; }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) {
-    return x;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { return 0; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) {
-    return x;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x,
-                                                  const val_type y) {
-    return intPowUnsigned<val_type>(x, y);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) {
-    // This will result in no loss of accuracy, though it might be
-    // more expensive than it should, if we were clever about using
-    // bit operations.
-    return static_cast<val_type>(
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-        sycl::sqrt(static_cast<float>(abs(x)))
-#else
-        ::sqrt(static_cast<float>(abs(x)))
-#endif
-    );
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) {
-    return static_cast<val_type>(
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-        sycl::cbrt(static_cast<float>(abs(x)))
-#else
-        ::cbrt(static_cast<float>(abs(x)))
-#endif
-    );
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) {
-    return static_cast<val_type>(::exp(static_cast<float>(x)));
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) {
-    return static_cast<val_type>(::log(static_cast<float>(x)));
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) {
-    return static_cast<val_type>(::log10(static_cast<float>(x)));
-  }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) {
-  //   return static_cast<val_type> ( ::sin (static_cast<float> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) {
-  //   return static_cast<val_type> ( ::cos (static_cast<float> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) {
-  //   return static_cast<val_type> ( ::tan (static_cast<float> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) {
-  //   return static_cast<val_type> ( ::sinh (static_cast<float> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) {
-  //   return static_cast<val_type> ( ::cosh (static_cast<float> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) {
-  //   return static_cast<val_type> ( ::tanh (static_cast<float> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) {
-  //   return static_cast<val_type> ( ::asin (static_cast<float> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) {
-  //   return static_cast<val_type> ( ::acos (static_cast<float> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) {
-  //   return static_cast<val_type> ( ::atan (static_cast<float> (x)));
-  // }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); }
-
-  // Backwards compatibility with Teuchos::ScalarTraits.
-  typedef mag_type magnitudeType;
-  typedef val_type halfPrecision;
-  typedef val_type doublePrecision;
-
-  static const bool isComplex            = false;
-  static const bool isOrdinal            = true;
-  static const bool isComparable         = true;
-  static const bool hasMachineParameters = false;
-  static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) {
-    return abs(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) {
-    return conj(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type) {
-    return false;
-  }
-  static std::string name() { return "unsigned char"; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) {
-    return sqrt(x);
-  }
-};
-
-template <>
-class ArithTraits<short> {
- public:
-  typedef short val_type;
-  typedef val_type mag_type;
-
-  static const bool is_specialized = true;
-  static const bool is_signed      = true;
-  static const bool is_integer     = true;
-  static const bool is_exact       = true;
-  static const bool is_complex     = false;
-
-  static constexpr bool has_infinity = false;
-  static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { return 0; }
-
-  static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) {
-    return false;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type) {
-    return false;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) {
-    // std::abs appears to work with CUDA 5.5 at least, but I'll use
-    // the ternary expression for maximum generality.  Note that this
-    // expression does not necessarily obey the rules for fabs() with
-    // NaN input, so it should not be used for floating-point types.
-    // It's perfectly fine for signed integer types, though.
-    return x >= 0 ? x : -x;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return 0; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type one() { return 1; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type min() {
-    // Macros like this work with CUDA, but
-    // std::numeric_limits<val_type>::min() does not, because it is
-    // not marked as a __device__ function.
-    return SHRT_MIN;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type max() { return SHRT_MAX; }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) {
-    return x;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { return 0; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) {
-    return x;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x,
-                                                  const val_type y) {
-    return intPowSigned<val_type>(x, y);
-  }
-  //! Integer square root returns a lower bound.
-  static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) {
-    // This will result in no loss of accuracy, though it might be
-    // more expensive than it should, if we were clever about using
-    // bit operations.
-    return static_cast<val_type>(
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-        sycl::sqrt(static_cast<float>(abs(x)))
-#else
-        ::sqrt(static_cast<float>(abs(x)))
-#endif
-    );
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) {
-    return static_cast<val_type>(
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-        sycl::cbrt(static_cast<float>(abs(x)))
-#else
-        ::cbrt(static_cast<float>(abs(x)))
-#endif
-    );
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) {
-    return static_cast<val_type>(::exp(static_cast<float>(abs(x))));
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) {
-    return static_cast<val_type>(::log(static_cast<float>(abs(x))));
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) {
-    return static_cast<val_type>(::log10(static_cast<float>(abs(x))));
-  }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) {
-  //   return static_cast<val_type> ( ::sin (static_cast<float> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) {
-  //   return static_cast<val_type> ( ::cos (static_cast<float> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) {
-  //   return static_cast<val_type> ( ::tan (static_cast<float> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) {
-  //   return static_cast<val_type> ( ::sinh (static_cast<float> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) {
-  //   return static_cast<val_type> ( ::cosh (static_cast<float> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) {
-  //   return static_cast<val_type> ( ::tanh (static_cast<float> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) {
-  //   return static_cast<val_type> ( ::asin (static_cast<float> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) {
-  //   return static_cast<val_type> ( ::acos (static_cast<float> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) {
-  //   return static_cast<val_type> ( ::atan (static_cast<float> (x)));
-  // }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type nan() {
-    // short doesn't implement a NaN value, but we can still have it
-    // return some "flag" value that can help users find use of
-    // uninitialized data.
-    return static_cast<val_type>(-1);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); }
-
-  // Backwards compatibility with Teuchos::ScalarTraits.
-  typedef mag_type magnitudeType;
-  typedef val_type halfPrecision;
-  typedef val_type doublePrecision;
-
-  static const bool isComplex            = false;
-  static const bool isOrdinal            = true;
-  static const bool isComparable         = true;
-  static const bool hasMachineParameters = false;
-  static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) {
-    return abs(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) {
-    return conj(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type) {
-    return false;
-  }
-  static std::string name() { return "short"; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) {
-    return sqrt(x);
-  }
-};
-
-template <>
-class ArithTraits<unsigned short> {
- public:
-  typedef unsigned short val_type;
-  typedef val_type mag_type;
-
-  static const bool is_specialized = true;
-  static const bool is_signed      = false;
-  static const bool is_integer     = true;
-  static const bool is_exact       = true;
-  static const bool is_complex     = false;
-
-  static constexpr bool has_infinity = false;
-  static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { return 0; }
-
-  static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) {
-    return false;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type) {
-    return false;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) {
-    return x;  // it's unsigned, so it's positive
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return 0; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type one() { return 1; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type min() { return 0; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type max() { return USHRT_MAX; }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) {
-    return x;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { return 0; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) {
-    return x;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x,
-                                                  const val_type y) {
-    return intPowUnsigned<val_type>(x, y);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) {
-    // This will result in no loss of accuracy, though it might be
-    // more expensive than it should, if we were clever about using
-    // bit operations.
-    return static_cast<val_type>(
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-        sycl::sqrt(static_cast<float>(abs(x)))
-#else
-        ::sqrt(static_cast<float>(abs(x)))
-#endif
-    );
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) {
-    return static_cast<val_type>(
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-        sycl::cbrt(static_cast<float>(abs(x)))
-#else
-        ::cbrt(static_cast<float>(abs(x)))
-#endif
-    );
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) {
-    return static_cast<val_type>(::exp(static_cast<float>(x)));
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) {
-    return static_cast<val_type>(::log(static_cast<float>(x)));
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) {
-    return static_cast<val_type>(::log10(static_cast<float>(x)));
-  }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) {
-  //   return static_cast<val_type> ( ::sin (static_cast<float> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) {
-  //   return static_cast<val_type> ( ::cos (static_cast<float> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) {
-  //   return static_cast<val_type> ( ::tan (static_cast<float> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) {
-  //   return static_cast<val_type> ( ::sinh (static_cast<float> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) {
-  //   return static_cast<val_type> ( ::cosh (static_cast<float> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) {
-  //   return static_cast<val_type> ( ::tanh (static_cast<float> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) {
-  //   return static_cast<val_type> ( ::asin (static_cast<float> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) {
-  //   return static_cast<val_type> ( ::acos (static_cast<float> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) {
-  //   return static_cast<val_type> ( ::atan (static_cast<float> (x)));
-  // }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type nan() {
-    // unsigned short doesn't implement a NaN value, but we can still
-    // have it return some "flag" value that can help users find use
-    // of uninitialized data.
-    return max();
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); }
-
-  // Backwards compatibility with Teuchos::ScalarTraits.
-  typedef mag_type magnitudeType;
-  typedef val_type halfPrecision;
-  typedef val_type doublePrecision;
-
-  static const bool isComplex            = false;
-  static const bool isOrdinal            = true;
-  static const bool isComparable         = true;
-  static const bool hasMachineParameters = false;
-  static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) {
-    return abs(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) {
-    return conj(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type) {
-    return false;
-  }
-  static std::string name() { return "unsigned short"; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) {
-    return sqrt(x);
-  }
-};
-
-template <>
-class ArithTraits<int> {
- public:
-  typedef int val_type;
-  typedef val_type mag_type;
-
-  static const bool is_specialized = true;
-  static const bool is_signed      = true;
-  static const bool is_integer     = true;
-  static const bool is_exact       = true;
-  static const bool is_complex     = false;
-
-  static constexpr bool has_infinity = false;
-  static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { return 0; }
-
-  static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) {
-    return false;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type) {
-    return false;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) {
-    // std::abs appears to work with CUDA 5.5 at least, but I'll use
-    // the ternary expression for maximum generality.  Note that this
-    // expression does not necessarily obey the rules for fabs() with
-    // NaN input, so it should not be used for floating-point types.
-    // It's perfectly fine for signed integer types, though.
-    return x >= 0 ? x : -x;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return 0; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type one() { return 1; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type min() {
-    // Macros like INT_MIN work with CUDA, but
-    // std::numeric_limits<val_type>::min() does not, because it is
-    // not marked as a __device__ function.
-    return INT_MIN;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type max() { return INT_MAX; }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) {
-    return x;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { return 0; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) {
-    return x;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x,
-                                                  const val_type y) {
-    return intPowSigned<val_type>(x, y);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) {
-    // This will result in no loss of accuracy, though it might be
-    // more expensive than it should, if we were clever about using
-    // bit operations.
-    return static_cast<val_type>(
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-        sycl::sqrt(static_cast<float>(abs(x)))
-#else
-        ::sqrt(static_cast<float>(abs(x)))
-#endif
-    );
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) {
-    return static_cast<val_type>(
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-        sycl::cbrt(static_cast<float>(abs(x)))
-#else
-        ::cbrt(static_cast<float>(abs(x)))
-#endif
-    );
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) {
-    return static_cast<val_type>(::exp(static_cast<double>(abs(x))));
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) {
-    return static_cast<val_type>(::log(static_cast<double>(abs(x))));
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) {
-    return static_cast<val_type>(::log10(static_cast<double>(abs(x))));
-  }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) {
-  //   return static_cast<val_type> ( ::sin (static_cast<double> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) {
-  //   return static_cast<val_type> ( ::cos (static_cast<double> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) {
-  //   return static_cast<val_type> ( ::tan (static_cast<double> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) {
-  //   return static_cast<val_type> ( ::sinh (static_cast<double> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) {
-  //   return static_cast<val_type> ( ::cosh (static_cast<double> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) {
-  //   return static_cast<val_type> ( ::tanh (static_cast<double> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) {
-  //   return static_cast<val_type> ( ::asin (static_cast<double> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) {
-  //   return static_cast<val_type> ( ::acos (static_cast<double> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) {
-  //   return static_cast<val_type> ( ::atan (static_cast<double> (x)));
-  // }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type nan() {
-    // int doesn't implement a NaN value, but we can still have it
-    // return some "flag" value that can help users find use of
-    // uninitialized data.
-    return -1;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); }
-
-  // Backwards compatibility with Teuchos::ScalarTraits.
-  typedef mag_type magnitudeType;
-  typedef val_type halfPrecision;
-  typedef val_type doublePrecision;
-
-  static const bool isComplex            = false;
-  static const bool isOrdinal            = true;
-  static const bool isComparable         = true;
-  static const bool hasMachineParameters = false;
-  static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) {
-    return abs(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) {
-    return conj(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type) {
-    return false;
-  }
-  static std::string name() { return "int"; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) {
-    return sqrt(x);
-  }
-};
-
-template <>
-class ArithTraits<unsigned int> {
- public:
-  typedef unsigned int val_type;
-  typedef val_type mag_type;
-
-  static const bool is_specialized = true;
-  static const bool is_signed      = false;
-  static const bool is_integer     = true;
-  static const bool is_exact       = true;
-  static const bool is_complex     = false;
-
-  static constexpr bool has_infinity = false;
-  static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { return 0; }
-
-  static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) {
-    return false;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type) {
-    return false;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) {
-    return x;  // it's unsigned, so it's positive
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return 0; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type one() { return 1; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type min() { return 0; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type max() { return UINT_MAX; }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) {
-    return x;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { return 0; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) {
-    return x;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x,
-                                                  const val_type y) {
-    return intPowUnsigned<val_type>(x, y);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) {
-    // This will result in no loss of accuracy, though it might be
-    // more expensive than it should, if we were clever about using
-    // bit operations.
-    return static_cast<val_type>(
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-        sycl::sqrt(static_cast<float>(abs(x)))
-#else
-        ::sqrt(static_cast<float>(abs(x)))
-#endif
-    );
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) {
-    return static_cast<val_type>(
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-        sycl::cbrt(static_cast<float>(abs(x)))
-#else
-        ::cbrt(static_cast<float>(abs(x)))
-#endif
-    );
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) {
-    return static_cast<val_type>(::exp(static_cast<double>(x)));
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) {
-    return static_cast<val_type>(::log(static_cast<double>(x)));
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) {
-    return static_cast<val_type>(::log10(static_cast<double>(x)));
-  }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) {
-  //   return static_cast<val_type> ( ::sin (static_cast<double> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) {
-  //   return static_cast<val_type> ( ::cos (static_cast<double> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) {
-  //   return static_cast<val_type> ( ::tan (static_cast<double> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) {
-  //   return static_cast<val_type> ( ::sinh (static_cast<double> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) {
-  //   return static_cast<val_type> ( ::cosh (static_cast<double> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) {
-  //   return static_cast<val_type> ( ::tanh (static_cast<double> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) {
-  //   return static_cast<val_type> ( ::asin (static_cast<double> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) {
-  //   return static_cast<val_type> ( ::acos (static_cast<double> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) {
-  //   return static_cast<val_type> ( ::atan (static_cast<double> (x)));
-  // }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type nan() {
-    // unsigned int doesn't implement a NaN value, but we can still
-    // have it return some "flag" value that can help users find use
-    // of uninitialized data.
-    return max();
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); }
-
-  // Backwards compatibility with Teuchos::ScalarTraits.
-  typedef mag_type magnitudeType;
-  typedef val_type halfPrecision;
-  typedef val_type doublePrecision;
-
-  static const bool isComplex            = false;
-  static const bool isOrdinal            = true;
-  static const bool isComparable         = true;
-  static const bool hasMachineParameters = false;
-  static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) {
-    return abs(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) {
-    return conj(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type) {
-    return false;
-  }
-  static std::string name() { return "unsigned int"; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) {
-    return sqrt(x);
-  }
-};
-
-template <>
-class ArithTraits<long> {
- public:
-  typedef long val_type;
-  typedef val_type mag_type;
-
-  static const bool is_specialized = true;
-  static const bool is_signed      = true;
-  static const bool is_integer     = true;
-  static const bool is_exact       = true;
-  static const bool is_complex     = false;
-
-  static constexpr bool has_infinity = false;
-  static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { return 0; }
-
-  static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) {
-    return false;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type) {
-    return false;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) {
-    return x >= 0 ? x : -x;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return 0; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type one() { return 1; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type min() { return LONG_MIN; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type max() { return LONG_MAX; }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) {
-    return x;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { return 0; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) {
-    return x;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x,
-                                                  const val_type y) {
-    return intPowSigned<val_type>(x, y);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) {
-    using std::abs;
-    using std::sqrt;
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
-    return static_cast<val_type>(sqrt(static_cast<long double>(abs(x))));
-#else
-    return static_cast<val_type>(sqrt(static_cast<double>(abs(x))));
-#endif
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) {
-    return static_cast<val_type>(::log(static_cast<double>(abs(x))));
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) {
-    return static_cast<val_type>(::log10(static_cast<double>(abs(x))));
-  }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) {
-  //   return static_cast<val_type> ( ::sin (static_cast<double> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) {
-  //   return static_cast<val_type> ( ::cos (static_cast<double> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) {
-  //   return static_cast<val_type> ( ::tan (static_cast<double> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) {
-  //   return static_cast<val_type> ( ::sinh (static_cast<double> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) {
-  //   return static_cast<val_type> ( ::cosh (static_cast<double> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) {
-  //   return static_cast<val_type> ( ::tanh (static_cast<double> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) {
-  //   return static_cast<val_type> ( ::asin (static_cast<double> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) {
-  //   return static_cast<val_type> ( ::acos (static_cast<double> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) {
-  //   return static_cast<val_type> ( ::atan (static_cast<double> (x)));
-  // }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type nan() {
-    // long doesn't implement a NaN value, but we can still have it
-    // return some "flag" value that can help users find use of
-    // uninitialized data.
-    return -1;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); }
-
-  // Backwards compatibility with Teuchos::ScalarTraits.
-  typedef mag_type magnitudeType;
-  typedef val_type halfPrecision;
-  typedef val_type doublePrecision;
-
-  static const bool isComplex            = false;
-  static const bool isOrdinal            = true;
-  static const bool isComparable         = true;
-  static const bool hasMachineParameters = false;
-  static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) {
-    return abs(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) {
-    return conj(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type) {
-    return false;
-  }
-  static std::string name() { return "long"; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) {
-    return sqrt(x);
-  }
-};
-
-template <>
-class ArithTraits<unsigned long> {
- public:
-  typedef unsigned long val_type;
-  typedef val_type mag_type;
-
-  static const bool is_specialized = true;
-  static const bool is_signed      = false;
-  static const bool is_integer     = true;
-  static const bool is_exact       = true;
-  static const bool is_complex     = false;
-
-  static constexpr bool has_infinity = false;
-  static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { return 0; }
-
-  static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) {
-    return false;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type) {
-    return false;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) {
-    return x;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return 0; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type one() { return 1; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type min() { return 0; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type max() { return ULONG_MAX; }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) {
-    return x;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { return 0; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) {
-    return x;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x,
-                                                  const val_type y) {
-    return intPowUnsigned<val_type>(x, y);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) {
-    using std::sqrt;
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
-    return static_cast<val_type>(sqrt(static_cast<long double>(x)));
-#else
-    return static_cast<val_type>(sqrt(static_cast<double>(x)));
-#endif
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
-    using std::cbrtl;
-    return static_cast<val_type>(::cbrtl(static_cast<long double>(x)));
-#else
-    return static_cast<val_type>(
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-        sycl::cbrt(static_cast<double>(abs(x)))
-#else
-        ::cbrt(static_cast<double>(abs(x)))
-#endif
-    );
-#endif
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) {
-    return static_cast<val_type>(::exp(static_cast<double>(x)));
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) {
-    return static_cast<long>(::log(static_cast<double>(x)));
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) {
-    return static_cast<long>(::log10(static_cast<double>(x)));
-  }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) {
-  //   return static_cast<val_type> ( ::sin (static_cast<double> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) {
-  //   return static_cast<val_type> ( ::cos (static_cast<double> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) {
-  //   return static_cast<val_type> ( ::tan (static_cast<double> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) {
-  //   return static_cast<val_type> ( ::sinh (static_cast<double> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) {
-  //   return static_cast<val_type> ( ::cosh (static_cast<double> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) {
-  //   return static_cast<val_type> ( ::tanh (static_cast<double> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) {
-  //   return static_cast<val_type> ( ::asin (static_cast<double> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) {
-  //   return static_cast<val_type> ( ::acos (static_cast<double> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) {
-  //   return static_cast<val_type> ( ::atan (static_cast<double> (x)));
-  // }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type nan() {
-    // unsigned long doesn't implement a NaN value, but we can still
-    // have it return some "flag" value that can help users find use
-    // of uninitialized data.
-    return max();
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); }
-
-  // Backwards compatibility with Teuchos::ScalarTraits.
-  typedef mag_type magnitudeType;
-  typedef val_type halfPrecision;
-  typedef val_type doublePrecision;
-
-  static const bool isComplex            = false;
-  static const bool isOrdinal            = true;
-  static const bool isComparable         = true;
-  static const bool hasMachineParameters = false;
-  static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) {
-    return abs(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) {
-    return conj(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type) {
-    return false;
-  }
-  static std::string name() { return "unsigned long"; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) {
-    return sqrt(x);
-  }
-};
-
-template <>
-class ArithTraits<long long> {
- public:
-  typedef long long val_type;
-  typedef val_type mag_type;
-
-  static const bool is_specialized = true;
-  static const bool is_signed      = true;
-  static const bool is_integer     = true;
-  static const bool is_exact       = true;
-  static const bool is_complex     = false;
-
-  static constexpr bool has_infinity = false;
-  static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { return 0; }
-
-  static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) {
-    return false;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type) {
-    return false;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) {
-    return x >= 0 ? x : -x;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return 0; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type one() { return 1; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type min() { return LLONG_MIN; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type max() { return LLONG_MAX; }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) {
-    return x;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { return 0; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) {
-    return x;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x,
-                                                  const val_type y) {
-    return intPowSigned<val_type>(x, y);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
-    using std::abs;
-    using std::sqrt;
-    // IEEE 754 promises that long double has at least 64 significand
-    // bits, so we can use it to represent any signed or unsigned
-    // 64-bit integer type exactly.  However, CUDA does not implement
-    // long double for device functions.
-    return static_cast<val_type>(sqrt(static_cast<long double>(abs(x))));
-#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL)
-    // Casting from a 64-bit integer type to double does result in a
-    // loss of accuracy.  However, it gives us a good first
-    // approximation.  For very large numbers, we may lose some
-    // significand bits, but will always get within a factor of two
-    // (assuming correct rounding) of the exact double-precision
-    // number.  We could then binary search between half the result
-    // and twice the result (assuming the latter is <= INT64_MAX,
-    // which it has to be, so we don't have to check) to ensure
-    // correctness.  It actually should suffice to check numbers
-    // within 1 of the result.
-    return static_cast<val_type>(sycl::sqrt(static_cast<double>(abs(x))));
-#else
-    return static_cast<val_type>(::sqrt(static_cast<double>(abs(x))));
-#endif
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
-    using std::abs;
-    using std::cbrtl;
-    return static_cast<val_type>(cbrtl(static_cast<long double>(abs(x))));
-#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL)
-    return static_cast<val_type>(sycl::cbrt(static_cast<double>(abs(x))));
-#else
-    return static_cast<val_type>(::cbrt(static_cast<double>(abs(x))));
-#endif
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) {
-    return static_cast<val_type>(::exp(static_cast<double>(abs(x))));
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) {
-    return static_cast<val_type>(::log(static_cast<double>(abs(x))));
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) {
-    return static_cast<val_type>(::log10(static_cast<double>(abs(x))));
-  }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) {
-  //   return static_cast<val_type> ( ::sin (static_cast<double> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) {
-  //   return static_cast<val_type> ( ::cos (static_cast<double> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) {
-  //   return static_cast<val_type> ( ::tan (static_cast<double> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) {
-  //   return static_cast<val_type> ( ::sinh (static_cast<double> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) {
-  //   return static_cast<val_type> ( ::cosh (static_cast<double> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) {
-  //   return static_cast<val_type> ( ::tanh (static_cast<double> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) {
-  //   return static_cast<val_type> ( ::asin (static_cast<double> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) {
-  //   return static_cast<val_type> ( ::acos (static_cast<double> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) {
-  //   return static_cast<val_type> ( ::atan (static_cast<double> (x)));
-  // }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type nan() {
-    // long long doesn't implement a NaN value, but we can still have
-    // it return some "flag" value that can help users find use of
-    // uninitialized data.
-    return -1;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); }
-
-  // Backwards compatibility with Teuchos::ScalarTraits.
-  typedef mag_type magnitudeType;
-  typedef val_type halfPrecision;
-  typedef val_type doublePrecision;
-
-  static const bool isComplex            = false;
-  static const bool isOrdinal            = true;
-  static const bool isComparable         = true;
-  static const bool hasMachineParameters = false;
-  static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) {
-    return abs(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) {
-    return conj(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type) {
-    return false;
-  }
-  static std::string name() { return "long long"; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) {
-    return sqrt(x);
-  }
-};
-
-template <>
-class ArithTraits<unsigned long long> {
- public:
-  typedef unsigned long long val_type;
-  typedef val_type mag_type;
-
-  static const bool is_specialized = true;
-  static const bool is_signed      = false;
-  static const bool is_integer     = true;
-  static const bool is_exact       = true;
-  static const bool is_complex     = false;
-
-  static constexpr bool has_infinity = false;
-  static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { return 0; }
-
-  static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) {
-    return false;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type) {
-    return false;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) {
-    return x;  // unsigned integers are always nonnegative
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return 0; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type one() { return 1; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type min() { return 0; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type max() { return ULLONG_MAX; }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) {
-    return x;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { return 0; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) {
-    return x;
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x,
-                                                  const val_type y) {
-    return intPowUnsigned<val_type>(x, y);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
-    using std::sqrt;
-    return static_cast<val_type>(sqrt(static_cast<long double>(x)));
-#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL)
-    return static_cast<val_type>(sycl::sqrt(static_cast<double>(x)));
-#else
-    return static_cast<val_type>(::sqrt(static_cast<double>(x)));
-#endif
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
-    using std::cbrtl;
-    return static_cast<val_type>(cbrtl(static_cast<long double>(x)));
-#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL)
-    return static_cast<val_type>(sycl::cbrt(static_cast<double>(x)));
-#else
-    return static_cast<val_type>(::cbrt(static_cast<double>(x)));
-#endif
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) {
-    return static_cast<val_type>(::exp(static_cast<double>(x)));
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) {
-    return static_cast<val_type>(::log(static_cast<double>(x)));
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) {
-    return static_cast<val_type>(::log10(static_cast<double>(x)));
-  }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) {
-  //   return static_cast<val_type> ( ::sin (static_cast<double> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) {
-  //   return static_cast<val_type> ( ::cos (static_cast<double> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) {
-  //   return static_cast<val_type> ( ::tan (static_cast<double> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) {
-  //   return static_cast<val_type> ( ::sinh (static_cast<double> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) {
-  //   return static_cast<val_type> ( ::cosh (static_cast<double> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) {
-  //   return static_cast<val_type> ( ::tanh (static_cast<double> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) {
-  //   return static_cast<val_type> ( ::asin (static_cast<double> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) {
-  //   return static_cast<val_type> ( ::acos (static_cast<double> (x)));
-  // }
-  // static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) {
-  //   return static_cast<val_type> ( ::atan (static_cast<double> (x)));
-  // }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type nan() {
-    // unsigned long long doesn't implement a NaN value, but we can
-    // still have it return some "flag" value that can help users find
-    // use of uninitialized data.
-    return max();
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); }
-
-  // Backwards compatibility with Teuchos::ScalarTraits.
-  typedef mag_type magnitudeType;
-  typedef val_type halfPrecision;
-  typedef val_type doublePrecision;
-
-  static const bool isComplex            = false;
-  static const bool isOrdinal            = true;
-  static const bool isComparable         = true;
-  static const bool hasMachineParameters = false;
-  static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) {
-    return abs(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) {
-    return conj(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type) {
-    return false;
-  }
-  static std::string name() { return "unsigned long long"; }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) {
-    return sqrt(x);
-  }
-};
-
-// dd_real and qd_real are floating-point types provided by the QD
-// library of David Bailey (LBNL):
-//
-// http://crd-legacy.lbl.gov/~dhbailey/mpdist/
-//
-// dd_real uses two doubles (128 bits), and qd_real uses four doubles
-// (256 bits).
-//
-// Kokkos does <i>not</i> currently support these types in device
-// functions.  It should be possible to use Kokkos' support for
-// aggregate types to implement device function support for dd_real
-// and qd_real, but we have not done this yet (as of 09 Jan 2015).
-// Hence, the class methods of the ArithTraits specializations for
-// dd_real and qd_real are not marked as device functions.
-#ifdef HAVE_KOKKOS_QD
-template <>
-struct ArithTraits<dd_real> {
-  typedef dd_real val_type;
-  typedef dd_real mag_type;
-
-  static const bool is_specialized = true;
-  static const bool is_signed      = true;
-  static const bool is_integer     = false;
-  static const bool is_exact       = false;
-  static const bool is_complex     = false;
-
-  static inline bool isInf(const val_type& x) { return isinf(x); }
-  static inline bool isNan(const val_type& x) { return isnan(x); }
-  static inline mag_type abs(const val_type& x) { return ::abs(x); }
-  static inline val_type zero() { return val_type(0.0); }
-  static inline val_type one() { return val_type(1.0); }
-  static inline val_type min() { return std::numeric_limits<val_type>::min(); }
-  static inline val_type max() { return std::numeric_limits<val_type>::max(); }
-  static inline mag_type real(const val_type& x) { return x; }
-  static inline mag_type imag(const val_type&) { return zero(); }
-  static inline val_type conj(const val_type& x) { return x; }
-  static inline val_type pow(const val_type& x, const val_type& y) {
-    return ::pow(x, y);
-  }
-  static inline val_type sqrt(const val_type& x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-    return sycl::sqrt(x);
-#else
-    return ::sqrt(x);
-#endif
-  }
-  static inline val_type cbrt(const val_type& x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-    return sycl::cbrt(x);
-#else
-    return ::cbrt(x);
-#endif
-  }
-  static inline val_type exp(const val_type& x) { return ::exp(x); }
-  static inline val_type log(const val_type& x) {
-    // dd_real puts its transcendental functions in the global namespace.
-    return ::log(x);
-  }
-  static inline val_type log10(const val_type& x) { return ::log10(x); }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type sin(const val_type x) {
-    return ::sin(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type cos(const val_type x) {
-    return ::cos(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type tan(const val_type x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-    return sycl::tan(x);
-#else
-    return std::tan(x);
-#endif
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type sinh(const val_type x) {
-    return ::sinh(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type cosh(const val_type x) {
-    return ::cosh(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type tanh(const val_type x) {
-    return ::tanh(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type asin(const val_type x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-    return sycl::asin(x);
-#else
-    return ::asin(x);
-#endif
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type acos(const val_type x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-    return sycl::acos(x);
-#else
-    return ::acos(x);
-#endif
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type atan(const val_type x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-    return sycl::atan(x);
-#else
-    return ::atan(x);
-#endif
-  }
-  static inline val_type nan() { return val_type::_nan; }
-  static val_type epsilon() { return std::numeric_limits<val_type>::epsilon(); }
-
-  typedef dd_real magnitudeType;
-  typedef double halfPrecision;
-  typedef qd_real doublePrecision;
-
-  static const bool isComplex            = false;
-  static const bool isOrdinal            = false;
-  static const bool isComparable         = true;
-  static const bool hasMachineParameters = true;
-
-  static mag_type eps() { return epsilon(); }
-  static mag_type sfmin() { return min(); }
-  static int base() { return std::numeric_limits<val_type>::radix; }
-  static mag_type prec() { return eps() * base(); }
-  static int t() { return std::numeric_limits<val_type>::digits; }
-  static mag_type rnd() {
-    return std::numeric_limits<val_type>::round_style == std::round_to_nearest
-               ? one()
-               : zero();
-  }
-  static int emin() { return std::numeric_limits<val_type>::min_exponent; }
-  static mag_type rmin() { return std::numeric_limits<val_type>::min(); }
-  static int emax() { return std::numeric_limits<val_type>::max_exponent; }
-  static mag_type rmax() { return std::numeric_limits<val_type>::max(); }
-  static mag_type magnitude(const val_type& x) { return ::abs(x); }
-  static val_type conjugate(const val_type& x) { return conj(x); }
-  static bool isnaninf(const val_type& x) { return isNan(x) || isInf(x); }
-  static std::string name() { return "dd_real"; }
-  static val_type squareroot(const val_type& x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-    return sycl::sqrt(x);
-#else
-    return ::sqrt(x);
-#endif
-  }
-};
-
-template <>
-struct ArithTraits<qd_real> {
-  typedef qd_real val_type;
-  typedef qd_real mag_type;
-
-  static const bool is_specialized = true;
-  static const bool is_signed      = true;
-  static const bool is_integer     = false;
-  static const bool is_exact       = false;
-  static const bool is_complex     = false;
-
-  static inline bool isInf(const val_type& x) { return isinf(x); }
-  static inline bool isNan(const val_type& x) { return isnan(x); }
-  static inline mag_type abs(const val_type& x) { return ::abs(x); }
-  static inline val_type zero() { return val_type(0.0); }
-  static inline val_type one() { return val_type(1.0); }
-  static inline val_type min() { return std::numeric_limits<val_type>::min(); }
-  static inline val_type max() { return std::numeric_limits<val_type>::max(); }
-  static inline mag_type real(const val_type& x) { return x; }
-  static inline mag_type imag(const val_type&) { return zero(); }
-  static inline val_type conj(const val_type& x) { return x; }
-  static inline val_type pow(const val_type& x, const val_type& y) {
-    return ::pow(x, y);
-  }
-  static inline val_type sqrt(const val_type& x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-    return sycl::sqrt(x);
-#else
-    return ::sqrt(x);
-#endif
-  }
-  static inline val_type cbrt(const val_type& x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-    return sycl::cbrt(x);
-#else
-    return ::cbrt(x);
-#endif
-  }
-  static inline val_type exp(const val_type& x) { return ::exp(x); }
-  static inline val_type log(const val_type& x) {
-    // val_type puts its transcendental functions in the global namespace.
-    return ::log(x);
-  }
-  static inline val_type log10(const val_type& x) { return ::log10(x); }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type sin(const val_type x) {
-    return ::sin(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type cos(const val_type x) {
-    return ::cos(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type tan(const val_type x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-    return sycl::tan(x);
-#else
-    return std::tan(x);
-#endif
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type sinh(const val_type x) {
-    return ::sinh(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type cosh(const val_type x) {
-    return ::cosh(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type tanh(const val_type x) {
-    return ::tanh(x);
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type asin(const val_type x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-    return sycl::asin(x);
-#else
-    return ::asin(x);
-#endif
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type acos(const val_type x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-    return sycl::acos(x);
-#else
-    return ::acos(x);
-#endif
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION val_type atan(const val_type x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-    return sycl::atan(x);
-#else
-    return ::atan(x);
-#endif
-  }
-  static inline val_type nan() { return val_type::_nan; }
-  static inline val_type epsilon() {
-    return std::numeric_limits<val_type>::epsilon();
-  }
-
-  typedef qd_real magnitudeType;
-  typedef dd_real halfPrecision;
-  // The QD library does not have an "oct-double real" class.  One
-  // could use an arbitrary-precision library like MPFR or ARPREC,
-  // with the precision set appropriately, to get an
-  // extended-precision type for qd_real.
-  typedef qd_real doublePrecision;
-
-  static const bool isComplex            = false;
-  static const bool isOrdinal            = false;
-  static const bool isComparable         = true;
-  static const bool hasMachineParameters = true;
-
-  static mag_type eps() { return epsilon(); }
-  static mag_type sfmin() { return min(); }
-  static int base() { return std::numeric_limits<val_type>::radix; }
-  static mag_type prec() { return eps() * base(); }
-  static int t() { return std::numeric_limits<val_type>::digits; }
-  static mag_type rnd() {
-    return std::numeric_limits<val_type>::round_style == std::round_to_nearest
-               ? one()
-               : zero();
-  }
-  static int emin() { return std::numeric_limits<val_type>::min_exponent; }
-  static mag_type rmin() { return std::numeric_limits<val_type>::min(); }
-  static int emax() { return std::numeric_limits<val_type>::max_exponent; }
-  static mag_type rmax() { return std::numeric_limits<val_type>::max(); }
-  static mag_type magnitude(const val_type& x) { return ::abs(x); }
-  static val_type conjugate(const val_type& x) { return conj(x); }
-  static bool isnaninf(const val_type& x) { return isNan(x) || isInf(x); }
-  static std::string name() { return "qd_real"; }
-  static val_type squareroot(const val_type& x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-    return sycl::sqrt(x);
-#else
-    return ::sqrt(x);
-#endif
-  }
-};
-#endif  // HAVE_KOKKOS_QD
-
-}  // namespace Details
-
-// Promote ArithTraits into Kokkos namespace.  At some point, we
-// will remove it from the Details namespace completely.  We leave
-// it there for now, because a lot of code depends on it being
-// there.
-using Details::ArithTraits;
-}  // namespace Kokkos
-
-#endif  // KOKKOS_ARITHTRAITS_HPP
diff --git a/src/batched/KokkosBatched_Util.hpp b/src/batched/KokkosBatched_Util.hpp
index 2b523e1e5f..46b97ee039 100644
--- a/src/batched/KokkosBatched_Util.hpp
+++ b/src/batched/KokkosBatched_Util.hpp
@@ -123,9 +123,7 @@ struct Flush {
   void init(value_type &update) { update = 0; }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type &update, const volatile value_type &input) {
-    update += input;
-  }
+  void join(value_type &update, const value_type &input) { update += input; }
 
   KOKKOS_INLINE_FUNCTION
   void operator()(const int i, value_type &update) const { update += _buf[i]; }
@@ -201,7 +199,8 @@ struct SIMD {
                     std::is_same<T, std::complex<float> >::value ||
                     std::is_same<T, Kokkos::complex<double> >::value ||
                     std::is_same<T, std::complex<double> >::value ||
-                    std::is_same<T, Kokkos::Experimental::half_t>::value,
+                    std::is_same<T, Kokkos::Experimental::half_t>::value ||
+                    std::is_same<T, Kokkos::Experimental::bhalf_t>::value,
                 "KokkosKernels:: Invalid SIMD<> type.");
   using value_type = T;
 };
@@ -718,6 +717,17 @@ KOKKOS_INLINE_FUNCTION
   iMatrix = iTemp / numRows;
 }
 
+template <typename OrdinalType, typename layout>
+KOKKOS_INLINE_FUNCTION
+    typename std::enable_if<std::is_same<layout, Kokkos::LayoutStride>::value,
+                            void>::type
+    getIndices(const OrdinalType iTemp, const OrdinalType /*numRows*/,
+               const OrdinalType numMatrices, OrdinalType &iRow,
+               OrdinalType &iMatrix) {
+  iRow    = iTemp / numMatrices;
+  iMatrix = iTemp % numMatrices;
+}
+
 template <class ViewType>
 KOKKOS_INLINE_FUNCTION auto transpose_2d_view(ViewType v, const int *order) {
   constexpr int rank         = 2;
@@ -842,10 +852,9 @@ KOKKOS_INLINE_FUNCTION ViewValueType fma_alpha(ViewValueType reg_c,
 
 template <class ViewValueType, class ScalarType>
 KOKKOS_INLINE_FUNCTION ViewValueType fma_alpha(ViewValueType reg_c,
-                                               ScalarType alpha,
+                                               ScalarType /*alpha*/,
                                                const AlphaTag::No &) {
   return reg_c;
-  (void)alpha;
 }
 
 template <class ViewType, class SizeType, class ViewValueType, class ScalarType,
diff --git a/src/batched/dense/KokkosBatched_Copy_Decl.hpp b/src/batched/dense/KokkosBatched_Copy_Decl.hpp
index c12c8d7209..af240c7d8b 100644
--- a/src/batched/dense/KokkosBatched_Copy_Decl.hpp
+++ b/src/batched/dense/KokkosBatched_Copy_Decl.hpp
@@ -11,7 +11,7 @@ namespace KokkosBatched {
 /// Serial Copy
 ///
 
-template <typename ArgTrans, int rank = 2>
+template <typename ArgTrans = Trans::NoTranspose, int rank = 2>
 struct SerialCopy {
   template <typename AViewType, typename BViewType>
   KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A,
diff --git a/src/batched/dense/KokkosBatched_Gesv.hpp b/src/batched/dense/KokkosBatched_Gesv.hpp
new file mode 100644
index 0000000000..cda2225c43
--- /dev/null
+++ b/src/batched/dense/KokkosBatched_Gesv.hpp
@@ -0,0 +1,180 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.4
+//       Copyright (2021) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+#ifndef __KOKKOSBATCHED_GESV_HPP__
+#define __KOKKOSBATCHED_GESV_HPP__
+
+/// \author Kim Liegeois (knliege@sandia.gov)
+
+#include "KokkosBatched_Util.hpp"
+#include "KokkosBatched_Vector.hpp"
+
+namespace KokkosBatched {
+
+struct Gesv {
+  struct StaticPivoting {};
+  struct NoPivoting {};
+
+  using Default = StaticPivoting;
+};
+
+/// \brief Serial Batched GESV:
+///
+/// Solve A_l x_l = b_l for all l = 0, ..., N
+/// using a batched LU decomposition, 2 batched triangular solves, and a batched
+/// static pivoting.
+///
+/// \tparam MatrixType: Input type for the matrix, needs to be a 2D view
+/// \tparam VectorType: Input type for the right-hand side and the solution,
+/// needs to be a 1D view
+///
+/// \param A [in]: matrix, a rank 2 view
+/// \param X [out]: solution, a rank 1 view
+/// \param B [in]: right-hand side, a rank 1 view
+/// \param tmp [in]: a rank 2 view used to store temporary variable; dimension
+/// must be n x (n+4) where n is the number of rows.
+///
+///
+/// Two versions are available (those are chosen based on ArgAlgo):
+///
+///   1. NoPivoting: the solver does not use a pivoting strategy,
+///   2. StaticPivoting: the solver uses a static pivoting strategy that relies
+///   on using
+///      maximal absolute value of row and column to choose pivots and apply
+///      them before calling the LU decomposition. Known limitation: the
+///      currently implemented strategy would not work with some matrices such
+///      as [[2, 1], [1, 0]], when this is the case, the Gesv (if used with
+///      pivoting), will return 1 and print an error message.
+///
+/// No nested parallel_for is used inside of the function.
+///
+
+template <typename ArgAlgo>
+struct SerialGesv {
+  template <typename MatrixType, typename VectorType>
+  KOKKOS_INLINE_FUNCTION static int invoke(const MatrixType A,
+                                           const VectorType X,
+                                           const VectorType Y,
+                                           const MatrixType tmp);
+};
+
+/// \brief Team Batched GESV:
+///
+/// Solve A_l x_l = b_l for all l = 0, ..., N
+/// using a batched LU decomposition, 2 batched triangular solves, and a batched
+/// static pivoting.
+///
+/// \tparam MatrixType: Input type for the matrix, needs to be a 2D view
+/// \tparam VectorType: Input type for the right-hand side and the solution,
+/// needs to be a 1D view
+///
+/// \param member [in]: TeamPolicy member
+/// \param A [in]: matrix, a rank 2 view
+/// \param X [out]: solution, a rank 1 view
+/// \param B [in]: right-hand side, a rank 1 view
+///
+/// Two versions are available (those are chosen based on ArgAlgo):
+///
+///   1. NoPivoting: the solver does not use a pivoting strategy,
+///   2. StaticPivoting: the solver uses a static pivoting strategy that relies
+///   on using
+///      maximal absolute value of row and column to choose pivots and apply
+///      them before calling the LU decomposition. Known limitation: the
+///      currently implemented strategy would not work with some matrices such
+///      as [[2, 1], [1, 0]], when this is the case, the Gesv (if used with
+///      pivoting), will return 1 and print an error message.
+///
+/// A nested parallel_for with TeamThreadRange is used.
+///
+
+template <typename MemberType, typename ArgAlgo>
+struct TeamGesv {
+  template <typename MatrixType, typename VectorType>
+  KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member,
+                                           const MatrixType A,
+                                           const VectorType X,
+                                           const VectorType Y);
+};
+
+/// \brief Team Vector Batched GESV:
+///
+/// Solve A_l x_l = b_l for all l = 0, ..., N
+/// using a batched LU decomposition, 2 batched triangular solves, and a batched
+/// static pivoting.
+///
+/// \tparam MatrixType: Input type for the matrix, needs to be a 2D view
+/// \tparam VectorType: Input type for the right-hand side and the solution,
+/// needs to be a 1D view
+///
+/// \param member [in]: TeamPolicy member
+/// \param A [in]: matrix, a rank 2 view
+/// \param X [out]: solution, a rank 1 view
+/// \param B [in]: right-hand side, a rank 1 view
+///
+/// Two versions are available (those are chosen based on ArgAlgo):
+///
+///   1. NoPivoting: the solver does not use a pivoting strategy,
+///   2. StaticPivoting: the solver uses a static pivoting strategy that relies
+///   on using
+///      maximal absolute value of row and column to choose pivots and apply
+///      them before calling the LU decomposition. Known limitation: the
+///      currently implemented strategy would not work with some matrices such
+///      as [[2, 1], [1, 0]], when this is the case, the Gesv (if used with
+///      pivoting), will return 1 and print an error message.
+///
+///   Two nested parallel_for with both TeamVectorRange and ThreadVectorRange
+///   (or one with TeamVectorRange) are used inside.
+///
+
+template <typename MemberType, typename ArgAlgo>
+struct TeamVectorGesv {
+  template <typename MatrixType, typename VectorType>
+  KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member,
+                                           const MatrixType A,
+                                           const VectorType X,
+                                           const VectorType Y);
+};
+
+}  // namespace KokkosBatched
+
+#include "KokkosBatched_Gesv_Impl.hpp"
+
+#endif
diff --git a/src/batched/dense/KokkosBatched_LU_Decl.hpp b/src/batched/dense/KokkosBatched_LU_Decl.hpp
index 8cffbdc766..9fa2e2b6e3 100644
--- a/src/batched/dense/KokkosBatched_LU_Decl.hpp
+++ b/src/batched/dense/KokkosBatched_LU_Decl.hpp
@@ -51,4 +51,7 @@ struct LU {
 
 }  // namespace KokkosBatched
 
+#include "KokkosBatched_LU_Serial_Impl.hpp"
+#include "KokkosBatched_LU_Team_Impl.hpp"
+
 #endif
diff --git a/src/batched/dense/KokkosBatched_Scale_Decl.hpp b/src/batched/dense/KokkosBatched_Scale_Decl.hpp
index f3ea9b0aab..f0675892fc 100644
--- a/src/batched/dense/KokkosBatched_Scale_Decl.hpp
+++ b/src/batched/dense/KokkosBatched_Scale_Decl.hpp
@@ -3,8 +3,7 @@
 
 /// \author Kyungjoo Kim (kyukim@sandia.gov)
 
-#include "KokkosBatched_Util.hpp"
-#include "KokkosBatched_Vector.hpp"
+#include "impl/Kokkos_Error.hpp"
 
 namespace KokkosBatched {
 
@@ -12,38 +11,50 @@ namespace KokkosBatched {
 /// Serial Scale
 ///
 
-struct SerialScale {
-  template <typename ScalarType, typename AViewType>
-  KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha,
-                                           const AViewType &A);
-};
+struct [[deprecated]] SerialScale{
+    template <typename ScalarType, typename AViewType>
+    KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha,
+                                             const AViewType &A){Kokkos::abort(
+        "KokkosBatched::SerialScale is deprecated: use KokkosBlas::SerialScale "
+        "instead");
+return 0;
+}  // namespace KokkosBatched
+}
+;
 
 ///
 /// Team Scale
 ///
 
 template <typename MemberType>
-struct TeamScale {
-  template <typename ScalarType, typename AViewType>
-  KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member,
-                                           const ScalarType alpha,
-                                           const AViewType &A);
-};
+struct [[deprecated]] TeamScale{
+    template <typename ScalarType, typename AViewType>
+    KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member,
+                                             const ScalarType alpha,
+                                             const AViewType &A){Kokkos::abort(
+        "KokkosBatched::TeamScale is deprecated: use KokkosBlas::TeamScale "
+        "instead");
+return 0;
+}
+}
+;
 
 ///
 /// TeamVector Scale
 ///
 
 template <typename MemberType>
-struct TeamVectorScale {
-  template <typename ScalarType, typename AViewType>
-  KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member,
-                                           const ScalarType alpha,
-                                           const AViewType &A);
-};
+struct [[deprecated]] TeamVectorScale{
+    template <typename ScalarType, typename AViewType>
+    KOKKOS_INLINE_FUNCTION static int invoke(
+        const MemberType &member, const ScalarType alpha, const AViewType &A){
+        Kokkos::abort("KokkosBatched::TeamVectorScale is deprecated: use "
+                      "KokkosBlas::TeamVectorScale instead");
+return 0;
+}
+}
+;
 
 }  // namespace KokkosBatched
 
-#include "KokkosBatched_Scale_Impl.hpp"
-
 #endif
diff --git a/src/batched/dense/KokkosBatched_Set_Decl.hpp b/src/batched/dense/KokkosBatched_Set_Decl.hpp
index 4ef0078e50..fd67cdc99b 100644
--- a/src/batched/dense/KokkosBatched_Set_Decl.hpp
+++ b/src/batched/dense/KokkosBatched_Set_Decl.hpp
@@ -3,46 +3,57 @@
 
 /// \author Kyungjoo Kim (kyukim@sandia.gov)
 
-#include "KokkosBatched_Util.hpp"
-#include "KokkosBatched_Vector.hpp"
+#include "impl/Kokkos_Error.hpp"
 
 namespace KokkosBatched {
 ///
 /// Serial Set
 ///
 
-struct SerialSet {
-  template <typename ScalarType, typename AViewType>
-  KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha,
-                                           const AViewType &A);
-};
+struct [[deprecated]] SerialSet{
+    template <typename ScalarType, typename AViewType>
+    KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha,
+                                             const AViewType &A){Kokkos::abort(
+        "KokkosBatched::SerialSet is deprecated: use KokkosBlas::SerialSet "
+        "instead");
+return 0;
+}  // namespace KokkosBatched
+}
+;
 
 ///
 /// Team Set
 ///
 
 template <typename MemberType>
-struct TeamSet {
-  template <typename ScalarType, typename AViewType>
-  KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member,
-                                           const ScalarType alpha,
-                                           const AViewType &A);
-};
+struct [[deprecated]] TeamSet{
+    template <typename ScalarType, typename AViewType>
+    KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member,
+                                             const ScalarType alpha,
+                                             const AViewType &A){Kokkos::abort(
+        "KokkosBatched::TeamSet is deprecated: use KokkosBlas::TeamSet "
+        "instead");
+return 0;
+}
+}
+;
 
 ///
 /// TeamVector Set
 ///
 
 template <typename MemberType>
-struct TeamVectorSet {
-  template <typename ScalarType, typename AViewType>
-  KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member,
-                                           const ScalarType alpha,
-                                           const AViewType &A);
-};
+struct [[deprecated]] TeamVectorSet{
+    template <typename ScalarType, typename AViewType>
+    KOKKOS_INLINE_FUNCTION static int invoke(
+        const MemberType &member, const ScalarType alpha, const AViewType &A){
+        Kokkos::abort("KokkosBatched::TeamVectorSet is deprecated: use "
+                      "KokkosBlas::TeamVectorSet instead");
+return 0;
+}
+}
+;
 
 }  // namespace KokkosBatched
 
-#include "KokkosBatched_Set_Impl.hpp"
-
 #endif
diff --git a/src/batched/dense/impl/KokkosBatched_FindAmax_Internal.hpp b/src/batched/dense/impl/KokkosBatched_FindAmax_Internal.hpp
index f11210253e..32980219bf 100644
--- a/src/batched/dense/impl/KokkosBatched_FindAmax_Internal.hpp
+++ b/src/batched/dense/impl/KokkosBatched_FindAmax_Internal.hpp
@@ -43,7 +43,7 @@ struct TeamVectorFindAmaxInternal {
     if (m > 0) {
       using reducer_value_type =
           typename Kokkos::MaxLoc<ValueType, IntType>::value_type;
-      reducer_value_type value;
+      reducer_value_type value{};
       Kokkos::MaxLoc<ValueType, IntType> reducer_value(value);
       Kokkos::parallel_reduce(
           Kokkos::TeamVectorRange(member, m),
diff --git a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp
index 7bc5529fcc..d6331e215d 100644
--- a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp
+++ b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp
@@ -59,6 +59,21 @@ namespace Impl {
 /// CT/NT, NT/CT, CT/CT
 ///
 
+struct LayoutLeftTag {};
+struct LayoutRightTag {};
+template <class>
+struct TagFromLayoutHelper;
+template <>
+struct TagFromLayoutHelper<Kokkos::LayoutLeft> {
+  using tag = LayoutLeftTag;
+};
+template <>
+struct TagFromLayoutHelper<Kokkos::LayoutRight> {
+  using tag = LayoutRightTag;
+};
+template <class Layout>
+using TagFromLayout = typename TagFromLayoutHelper<Layout>::tag;
+
 // TODO - scaling between (32x32, 64x64)
 //   Option 0: Increase number of tiles and figure out how to map kokkos teams
 //             into cuda grid. Keep team size and vector lanes constant.
@@ -117,7 +132,8 @@ class BatchedDblBufGemm {
 
  private:
   void __run() {
-    using policy_type = Kokkos::TeamPolicy<layout_type, execution_space_type>;
+    using policy_type =
+        Kokkos::TeamPolicy<TagFromLayout<layout_type>, execution_space_type>;
     using member_type = typename policy_type::member_type;
 
     // Compile-time expressions required for functor-level register allocations:
@@ -335,8 +351,7 @@ class BatchedDblBufGemm {
     }
 
     KOKKOS_INLINE_FUNCTION
-    void operator()(const Kokkos::LayoutRight &,
-                    const MemberType &member) const {
+    void operator()(LayoutRightTag, const MemberType &member) const {
       // TODO: use Kokkos view with compile-time size to allocating register??
       //  Then we can use local deep copy for prefetch_reg population.
       // Allocate registers used for prefetching
@@ -503,8 +518,7 @@ class BatchedDblBufGemm {
     }
 
     KOKKOS_INLINE_FUNCTION
-    void operator()(const Kokkos::LayoutLeft &,
-                    const MemberType &member) const {
+    void operator()(LayoutLeftTag, const MemberType &member) const {
       // TODO: use Kokkos view with compile-time size to allocating register??
       //  Then we can use local deep copy for prefetch_reg population.
       // Allocate registers used for prefetching
diff --git a/src/batched/dense/impl/KokkosBatched_Gemm_Serial_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Gemm_Serial_Internal.hpp
index f2b009fe2f..1548d602e2 100644
--- a/src/batched/dense/impl/KokkosBatched_Gemm_Serial_Internal.hpp
+++ b/src/batched/dense/impl/KokkosBatched_Gemm_Serial_Internal.hpp
@@ -5,8 +5,8 @@
 
 #include "KokkosBatched_Util.hpp"
 
-#include "KokkosBatched_Set_Internal.hpp"
-#include "KokkosBatched_Scale_Internal.hpp"
+#include "KokkosBlas1_set_impl.hpp"
+#include "KokkosBlas1_serial_scal_impl.hpp"
 
 #include "KokkosBatched_InnerGemmFixC_Serial_Impl.hpp"
 
@@ -41,9 +41,9 @@ KOKKOS_INLINE_FUNCTION int SerialGemmInternal<Algo::Gemm::Unblocked>::invoke(
   const ScalarType one(1.0), zero(0.0);
 
   if (beta == zero)
-    SerialSetInternal ::invoke(m, n, zero, C, cs0, cs1);
+    KokkosBlas::Impl::SerialSetInternal::invoke(m, n, zero, C, cs0, cs1);
   else if (beta != one)
-    SerialScaleInternal::invoke(m, n, beta, C, cs0, cs1);
+    KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, beta, C, cs0, cs1);
 
   if (alpha != zero) {
     if (m <= 0 || n <= 0 || k <= 0) return 0;
@@ -81,9 +81,9 @@ KOKKOS_INLINE_FUNCTION int SerialGemmInternal<Algo::Gemm::Blocked>::invoke(
   const ScalarType one(1.0), zero(0.0);
 
   if (beta == zero)
-    SerialSetInternal ::invoke(m, n, zero, C, cs0, cs1);
+    KokkosBlas::Impl::SerialSetInternal::invoke(m, n, zero, C, cs0, cs1);
   else if (beta != one)
-    SerialScaleInternal::invoke(m, n, beta, C, cs0, cs1);
+    KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, beta, C, cs0, cs1);
 
   if (alpha != zero) {
     if (m <= 0 || n <= 0 || k <= 0) return 0;
diff --git a/src/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Internal.hpp
index b0c1f9c1ae..a516f765a1 100644
--- a/src/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Internal.hpp
+++ b/src/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Internal.hpp
@@ -5,8 +5,8 @@
 
 #include "KokkosBatched_Util.hpp"
 
-#include "KokkosBatched_Set_Internal.hpp"
-#include "KokkosBatched_Scale_Internal.hpp"
+#include "KokkosBlas1_set_impl.hpp"
+#include "KokkosBlas1_team_scal_impl.hpp"
 
 namespace KokkosBatched {
 
@@ -39,9 +39,11 @@ TeamVectorGemmInternal<Algo::Gemm::Unblocked, false>::invoke(
   const ScalarType one(1.0), zero(0.0);
 
   if (beta == zero)
-    TeamVectorSetInternal ::invoke(member, m, n, zero, C, cs0, cs1);
+    KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, n, zero, C, cs0,
+                                                    cs1);
   else if (beta != one)
-    TeamVectorScaleInternal::invoke(member, m, n, beta, C, cs0, cs1);
+    KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, n, beta, C,
+                                                      cs0, cs1);
 
   if (alpha != ScalarType(0.0)) {
     if (m <= 0 || n <= 0 || k <= 0) return 0;
@@ -79,9 +81,11 @@ TeamVectorGemmInternal<Algo::Gemm::Unblocked, true>::invoke(
   const ScalarType one(1.0), zero(0.0);
 
   if (beta == zero)
-    TeamVectorSetInternal ::invoke(member, m, n, zero, C, cs0, cs1);
+    KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, n, zero, C, cs0,
+                                                    cs1);
   else if (beta != one)
-    TeamVectorScaleInternal::invoke(member, m, n, beta, C, cs0, cs1);
+    KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, n, beta, C,
+                                                      cs0, cs1);
 
   if (alpha != ScalarType(0.0)) {
     if (m <= 0 || n <= 0 || k <= 0) return 0;
diff --git a/src/batched/dense/impl/KokkosBatched_Gemm_Team_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Gemm_Team_Internal.hpp
index 73d831586b..4f147a98fc 100644
--- a/src/batched/dense/impl/KokkosBatched_Gemm_Team_Internal.hpp
+++ b/src/batched/dense/impl/KokkosBatched_Gemm_Team_Internal.hpp
@@ -6,8 +6,8 @@
 #include "KokkosBatched_Util.hpp"
 #include "KokkosKernels_ExecSpaceUtils.hpp"
 
-#include "KokkosBatched_Set_Internal.hpp"
-#include "KokkosBatched_Scale_Internal.hpp"
+#include "KokkosBlas1_set_impl.hpp"
+#include "KokkosBlas1_team_scal_impl.hpp"
 
 #include "KokkosBatched_InnerGemmFixC_Serial_Impl.hpp"
 
@@ -41,9 +41,10 @@ KOKKOS_INLINE_FUNCTION int TeamGemmInternal<Algo::Gemm::Unblocked>::invoke(
   const ScalarType one(1.0), zero(0.0);
 
   if (beta == zero)
-    TeamSetInternal ::invoke(member, m, n, zero, C, cs0, cs1);
+    KokkosBlas::Impl::TeamSetInternal::invoke(member, m, n, zero, C, cs0, cs1);
   else if (beta != one)
-    TeamScaleInternal::invoke(member, m, n, beta, C, cs0, cs1);
+    KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, n, beta, C, cs0,
+                                                cs1);
 
   if (alpha != ScalarType(0.0)) {
     if (m <= 0 || n <= 0 || k <= 0) return 0;
@@ -82,9 +83,10 @@ KOKKOS_INLINE_FUNCTION int TeamGemmInternal<Algo::Gemm::Blocked>::invoke(
   const ScalarType one(1.0), zero(0.0);
 
   if (beta == zero)
-    TeamSetInternal ::invoke(member, m, n, zero, C, cs0, cs1);
+    KokkosBlas::Impl::TeamSetInternal::invoke(member, m, n, zero, C, cs0, cs1);
   else if (beta != one)
-    TeamScaleInternal::invoke(member, m, n, beta, C, cs0, cs1);
+    KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, n, beta, C, cs0,
+                                                cs1);
 
   if (alpha != ScalarType(0.0)) {
     if (m <= 0 || n <= 0 || k <= 0) return 0;
diff --git a/src/batched/dense/impl/KokkosBatched_Gemv_Serial_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Gemv_Serial_Internal.hpp
index fbd4a1e2d3..ef499b82fd 100644
--- a/src/batched/dense/impl/KokkosBatched_Gemv_Serial_Internal.hpp
+++ b/src/batched/dense/impl/KokkosBatched_Gemv_Serial_Internal.hpp
@@ -5,9 +5,8 @@
 
 #include "KokkosBatched_Util.hpp"
 
-#include "KokkosBatched_Set_Internal.hpp"
-#include "KokkosBatched_Scale_Internal.hpp"
-
+#include "KokkosBlas1_set_impl.hpp"
+#include "KokkosBlas1_serial_scal_impl.hpp"
 #include "KokkosBatched_InnerMultipleDotProduct_Serial_Impl.hpp"
 
 namespace KokkosBatched {
@@ -39,9 +38,9 @@ KOKKOS_INLINE_FUNCTION int SerialGemvInternal<Algo::Gemv::Unblocked>::invoke(
   // y (m), A(m x n), B(n)
 
   if (beta == zero)
-    SerialSetInternal ::invoke(m, zero, y, ys0);
+    KokkosBlas::Impl::SerialSetInternal::invoke(m, zero, y, ys0);
   else if (beta != one)
-    SerialScaleInternal::invoke(m, beta, y, ys0);
+    KokkosBlas::Impl::SerialScaleInternal::invoke(m, beta, y, ys0);
 
   if (alpha != zero) {
     if (m <= 0 || n <= 0) return 0;
@@ -78,9 +77,9 @@ KOKKOS_INLINE_FUNCTION int SerialGemvInternal<Algo::Gemv::Blocked>::invoke(
   constexpr int mbAlgo = Algo::Gemv::Blocked::mb();
 
   if (beta == zero)
-    SerialSetInternal ::invoke(m, zero, y, ys0);
+    KokkosBlas::Impl::SerialSetInternal::invoke(m, zero, y, ys0);
   else if (beta != one)
-    SerialScaleInternal::invoke(m, beta, y, ys0);
+    KokkosBlas::Impl::SerialScaleInternal::invoke(m, beta, y, ys0);
 
   if (alpha != zero) {
     if (m <= 0 || n <= 0) return 0;
diff --git a/src/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Impl.hpp
index 7e21019f94..0cad2c6c80 100644
--- a/src/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Impl.hpp
+++ b/src/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Impl.hpp
@@ -30,9 +30,17 @@ struct TeamVectorGemv<MemberType, Trans::NoTranspose, Algo::Gemv::Unblocked> {
   KOKKOS_INLINE_FUNCTION static int invoke(
       const MemberType &member, const ScalarType alpha, const AViewType &A,
       const xViewType &x, const ScalarType beta, const yViewType &y) {
-    return TeamVectorGemvInternal<Algo::Gemv::Unblocked>::invoke(
-        member, A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(),
-        A.stride_1(), x.data(), x.stride_0(), beta, y.data(), y.stride_0());
+    if (AViewType::Rank == 2)
+      return TeamVectorGemvInternal<Algo::Gemv::Unblocked>::invoke(
+          member, A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(),
+          A.stride_1(), x.data(), x.stride_0(), beta, y.data(), y.stride_0());
+    else
+      return TeamVectorGemvInternal<Algo::Gemv::Unblocked>::template invoke<
+          MemberType, ScalarType, typename AViewType::array_layout,
+          typename AViewType::non_const_value_type>(
+          member, A.extent(0), A.extent(1), A.extent(2), alpha, A.data(),
+          A.stride_0(), A.stride_1(), A.stride_2(), x.data(), x.stride_0(),
+          x.stride_1(), beta, y.data(), y.stride_0(), y.stride_1());
   }
 };
 
@@ -60,9 +68,17 @@ struct TeamVectorGemv<MemberType, Trans::Transpose, Algo::Gemv::Unblocked> {
   KOKKOS_INLINE_FUNCTION static int invoke(
       const MemberType &member, const ScalarType alpha, const AViewType &A,
       const xViewType &x, const ScalarType beta, const yViewType &y) {
-    return TeamVectorGemvInternal<Algo::Gemv::Unblocked>::invoke(
-        member, A.extent(1), A.extent(0), alpha, A.data(), A.stride_1(),
-        A.stride_0(), x.data(), x.stride_0(), beta, y.data(), y.stride_0());
+    if (AViewType::Rank == 2)
+      return TeamVectorGemvInternal<Algo::Gemv::Unblocked>::invoke(
+          member, A.extent(1), A.extent(0), alpha, A.data(), A.stride_1(),
+          A.stride_0(), x.data(), x.stride_0(), beta, y.data(), y.stride_0());
+    else
+      return TeamVectorGemvInternal<Algo::Gemv::Unblocked>::template invoke<
+          MemberType, ScalarType, typename AViewType::array_layout,
+          typename AViewType::non_const_value_type>(
+          member, A.extent(0), A.extent(2), A.extent(1), alpha, A.data(),
+          A.stride_0(), A.stride_2(), A.stride_1(), x.data(), x.stride_0(),
+          x.stride_1(), beta, y.data(), y.stride_0(), y.stride_1());
   }
 };
 
diff --git a/src/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Internal.hpp
index f4054030a3..406115aa4f 100644
--- a/src/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Internal.hpp
+++ b/src/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Internal.hpp
@@ -5,9 +5,8 @@
 
 #include "KokkosBatched_Util.hpp"
 
-#include "KokkosBatched_Set_Internal.hpp"
-#include "KokkosBatched_Scale_Internal.hpp"
-
+#include "KokkosBlas1_set_impl.hpp"
+#include "KokkosBlas1_team_scal_impl.hpp"
 #include "KokkosBatched_InnerMultipleDotProduct_Serial_Impl.hpp"
 
 namespace KokkosBatched {
@@ -28,6 +27,20 @@ struct TeamVectorGemvInternal {
     assert(false && "Error: encounter dummy impl");
     return 0;
   }
+  template <typename MemberType, typename ScalarType, typename layout,
+            typename ValueType>
+  KOKKOS_INLINE_FUNCTION static int invoke(
+      const MemberType & /*member*/, const int /*N*/, const int /*m*/,
+      const int /*n*/, const ScalarType /*alpha*/,
+      const ValueType *KOKKOS_RESTRICT /*A*/, const int /*as0*/,
+      const int /*as1*/, const int /*as2*/,
+      const ValueType *KOKKOS_RESTRICT /*x*/, const int /*xs0*/,
+      const int /*xs1*/, const ScalarType /*beta*/,
+      /**/ ValueType *KOKKOS_RESTRICT /*y*/, const int /*ys0*/,
+      const int /*ys1*/) {
+    assert(false && "Error: encounter dummy impl");
+    return 0;
+  }
 };
 
 template <>
@@ -44,9 +57,9 @@ TeamVectorGemvInternal<Algo::Gemv::Unblocked>::invoke(
   // y (m), A(m x n), B(n)
 
   if (beta == zero)
-    TeamVectorSetInternal ::invoke(member, m, zero, y, ys0);
+    KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, zero, y, ys0);
   else if (beta != one)
-    TeamVectorScaleInternal::invoke(member, m, beta, y, ys0);
+    KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, beta, y, ys0);
 
   if (alpha != zero) {
     if (m <= 0 || n <= 0) return 0;
@@ -69,6 +82,55 @@ TeamVectorGemvInternal<Algo::Gemv::Unblocked>::invoke(
   return 0;
 }
 
+template <>
+template <typename MemberType, typename ScalarType, typename layout,
+          typename ValueType>
+KOKKOS_INLINE_FUNCTION int
+TeamVectorGemvInternal<Algo::Gemv::Unblocked>::invoke(
+    const MemberType &member, const int N, const int m, const int n,
+    const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0,
+    const int as1, const int as2, const ValueType *KOKKOS_RESTRICT X,
+    const int xs0, const int xs1, const ScalarType beta,
+    /**/ ValueType *KOKKOS_RESTRICT Y, const int ys0, const int ys1) {
+  const ScalarType one(1.0), zero(0.0);
+
+  // y_l = beta y_l + alpha A_l x_l for l in range(0, N)
+  // y_l (m), A_l(m x n), B_l(n)
+
+  if (beta == zero)
+    Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, N * m),
+                         [&](const int &iTemp) {
+                           int iRow, iMatrix;
+                           getIndices<int, layout>(iTemp, m, N, iRow, iMatrix);
+                           Y[ys0 * iMatrix + ys1 * iRow] = zero;
+                         });
+  else if (beta != one)
+    Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, N * m),
+                         [&](const int &iTemp) {
+                           int iRow, iMatrix;
+                           getIndices<int, layout>(iTemp, m, N, iRow, iMatrix);
+                           Y[ys0 * iMatrix + ys1 * iRow] *= beta;
+                         });
+
+  if (alpha != zero) {
+    if (m <= 0 || n <= 0) return 0;
+
+    if (beta != one) member.team_barrier();
+
+    Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, N * m),
+                         [&](const int &iTemp) {
+                           int iRow, iMatrix;
+                           ValueType t(0);
+                           getIndices<int, layout>(iTemp, m, N, iRow, iMatrix);
+                           for (int i = 0; i < n; ++i)
+                             t += A[as0 * iMatrix + as1 * iRow + as2 * i] *
+                                  X[xs0 * iMatrix + xs1 * i];
+                           Y[ys0 * iMatrix + ys1 * iRow] += alpha * t;
+                         });
+  }
+  return 0;
+}
+
 }  // namespace KokkosBatched
 
 #endif
diff --git a/src/batched/dense/impl/KokkosBatched_Gemv_Team_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Gemv_Team_Impl.hpp
index 73ee2b9ad3..d32232524a 100644
--- a/src/batched/dense/impl/KokkosBatched_Gemv_Team_Impl.hpp
+++ b/src/batched/dense/impl/KokkosBatched_Gemv_Team_Impl.hpp
@@ -30,9 +30,17 @@ struct TeamGemv<MemberType, Trans::NoTranspose, Algo::Gemv::Unblocked> {
   KOKKOS_INLINE_FUNCTION static int invoke(
       const MemberType &member, const ScalarType alpha, const AViewType &A,
       const xViewType &x, const ScalarType beta, const yViewType &y) {
-    return TeamGemvInternal<Algo::Gemv::Unblocked>::invoke(
-        member, A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(),
-        A.stride_1(), x.data(), x.stride_0(), beta, y.data(), y.stride_0());
+    if (AViewType::Rank == 2)
+      return TeamGemvInternal<Algo::Gemv::Unblocked>::invoke(
+          member, A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(),
+          A.stride_1(), x.data(), x.stride_0(), beta, y.data(), y.stride_0());
+    else
+      return TeamGemvInternal<Algo::Gemv::Unblocked>::template invoke<
+          MemberType, ScalarType, typename AViewType::array_layout,
+          typename AViewType::non_const_value_type>(
+          member, A.extent(0), A.extent(1), A.extent(2), alpha, A.data(),
+          A.stride_0(), A.stride_1(), A.stride_2(), x.data(), x.stride_0(),
+          x.stride_1(), beta, y.data(), y.stride_0(), y.stride_1());
   }
 };
 
@@ -60,9 +68,17 @@ struct TeamGemv<MemberType, Trans::Transpose, Algo::Gemv::Unblocked> {
   KOKKOS_INLINE_FUNCTION static int invoke(
       const MemberType &member, const ScalarType alpha, const AViewType &A,
       const xViewType &x, const ScalarType beta, const yViewType &y) {
-    return TeamGemvInternal<Algo::Gemv::Unblocked>::invoke(
-        member, A.extent(1), A.extent(0), alpha, A.data(), A.stride_1(),
-        A.stride_0(), x.data(), x.stride_0(), beta, y.data(), y.stride_0());
+    if (AViewType::Rank == 2)
+      return TeamGemvInternal<Algo::Gemv::Unblocked>::invoke(
+          member, A.extent(1), A.extent(0), alpha, A.data(), A.stride_1(),
+          A.stride_0(), x.data(), x.stride_0(), beta, y.data(), y.stride_0());
+    else
+      return TeamGemvInternal<Algo::Gemv::Unblocked>::template invoke<
+          MemberType, ScalarType, typename AViewType::array_layout,
+          typename AViewType::non_const_value_type>(
+          member, A.extent(0), A.extent(2), A.extent(1), alpha, A.data(),
+          A.stride_0(), A.stride_2(), A.stride_1(), x.data(), x.stride_0(),
+          x.stride_1(), beta, y.data(), y.stride_0(), y.stride_1());
   }
 };
 
diff --git a/src/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp
index 98415cd034..cf611db5ca 100644
--- a/src/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp
+++ b/src/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp
@@ -5,9 +5,8 @@
 
 #include "KokkosBatched_Util.hpp"
 
-#include "KokkosBatched_Set_Internal.hpp"
-#include "KokkosBatched_Scale_Internal.hpp"
-
+#include "KokkosBlas1_set_impl.hpp"
+#include "KokkosBlas1_team_scal_impl.hpp"
 #include "KokkosBatched_InnerMultipleDotProduct_Serial_Impl.hpp"
 
 namespace KokkosBatched {
@@ -24,6 +23,15 @@ struct TeamGemvInternal {
       const int as1, const ValueType *KOKKOS_RESTRICT x, const int xs0,
       const ScalarType beta,
       /**/ ValueType *KOKKOS_RESTRICT y, const int ys0);
+
+  template <typename MemberType, typename ScalarType, typename layout,
+            typename ValueType>
+  KOKKOS_INLINE_FUNCTION static int invoke(
+      const MemberType &member, const int N, const int m, const int n,
+      const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0,
+      const int as1, const int as2, const ValueType *KOKKOS_RESTRICT x,
+      const int xs0, const int xs1, const ScalarType beta,
+      /**/ ValueType *KOKKOS_RESTRICT y, const int ys0, const int ys1);
 };
 
 template <>
@@ -39,9 +47,9 @@ KOKKOS_INLINE_FUNCTION int TeamGemvInternal<Algo::Gemv::Unblocked>::invoke(
   // y (m), A(m x n), B(n)
 
   if (beta == zero)
-    TeamSetInternal ::invoke(member, m, zero, y, ys0);
+    KokkosBlas::Impl::TeamSetInternal::invoke(member, m, zero, y, ys0);
   else if (beta != one)
-    TeamScaleInternal::invoke(member, m, beta, y, ys0);
+    KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, beta, y, ys0);
 
   if (alpha != zero) {
     if (m <= 0 || n <= 0) return 0;
@@ -78,9 +86,9 @@ KOKKOS_INLINE_FUNCTION int TeamGemvInternal<Algo::Gemv::Blocked>::invoke(
   constexpr int mbAlgo = Algo::Gemv::Blocked::mb();
 
   if (beta == zero)
-    TeamSetInternal ::invoke(member, m, zero, y, ys0);
+    KokkosBlas::Impl::TeamSetInternal::invoke(member, m, zero, y, ys0);
   else if (beta != one)
-    TeamScaleInternal::invoke(member, m, beta, y, ys0);
+    KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, beta, y, ys0);
 
   if (alpha != zero) {
     if (m <= 0 || n <= 0) return 0;
@@ -105,6 +113,54 @@ KOKKOS_INLINE_FUNCTION int TeamGemvInternal<Algo::Gemv::Blocked>::invoke(
 
   return 0;
 }
+
+template <>
+template <typename MemberType, typename ScalarType, typename layout,
+          typename ValueType>
+KOKKOS_INLINE_FUNCTION int TeamGemvInternal<Algo::Gemv::Unblocked>::invoke(
+    const MemberType &member, const int N, const int m, const int n,
+    const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0,
+    const int as1, const int as2, const ValueType *KOKKOS_RESTRICT X,
+    const int xs0, const int xs1, const ScalarType beta,
+    /**/ ValueType *KOKKOS_RESTRICT Y, const int ys0, const int ys1) {
+  const ScalarType one(1.0), zero(0.0);
+
+  // y_l = beta y_l + alpha A_l x_l for l in range(0, N)
+  // y_l (m), A_l(m x n), B_l(n)
+
+  if (beta == zero)
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, N * m),
+                         [&](const int &iTemp) {
+                           int iRow, iMatrix;
+                           getIndices<int, layout>(iTemp, m, N, iRow, iMatrix);
+                           Y[ys0 * iMatrix + ys1 * iRow] = zero;
+                         });
+  else if (beta != one)
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, N * m),
+                         [&](const int &iTemp) {
+                           int iRow, iMatrix;
+                           getIndices<int, layout>(iTemp, m, N, iRow, iMatrix);
+                           Y[ys0 * iMatrix + ys1 * iRow] *= beta;
+                         });
+
+  if (alpha != zero) {
+    if (m <= 0 || n <= 0) return 0;
+
+    if (beta != one) member.team_barrier();
+
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, N * m),
+                         [&](const int &iTemp) {
+                           int iRow, iMatrix;
+                           ValueType t(0);
+                           getIndices<int, layout>(iTemp, m, N, iRow, iMatrix);
+                           for (int i = 0; i < n; ++i)
+                             t += A[as0 * iMatrix + as1 * iRow + as2 * i] *
+                                  X[xs0 * iMatrix + xs1 * i];
+                           Y[ys0 * iMatrix + ys1 * iRow] += alpha * t;
+                         });
+  }
+  return 0;
+}
 }  // namespace KokkosBatched
 
 #endif
diff --git a/src/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp
new file mode 100644
index 0000000000..a9e10a1ebd
--- /dev/null
+++ b/src/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp
@@ -0,0 +1,790 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.4
+//       Copyright (2021) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+#ifndef __KOKKOSBATCHED_GESV_IMPL_HPP__
+#define __KOKKOSBATCHED_GESV_IMPL_HPP__
+
+/// \author Kim Liegeois (knliege@sandia.gov)
+
+#include "KokkosBatched_Util.hpp"
+#include <KokkosBatched_LU_Decl.hpp>
+#include "KokkosBatched_Trsm_Decl.hpp"
+#include "KokkosBatched_Copy_Decl.hpp"
+
+namespace KokkosBatched {
+
+struct SerialStaticPivoting {
+  template <class MatrixType1, class MatrixType2, class VectorType1,
+            class VectorType2>
+  KOKKOS_INLINE_FUNCTION static int invoke(
+      const MatrixType1 A, const MatrixType2 PDAD, const VectorType1 Y,
+      const VectorType2 PDY, const VectorType2 D2, const VectorType2 tmp_v_1,
+      const VectorType2 tmp_v_2);
+};
+
+template <typename MemberType>
+struct TeamStaticPivoting {
+  template <class MatrixType1, class MatrixType2, class VectorType1,
+            class VectorType2>
+  KOKKOS_INLINE_FUNCTION static int invoke(
+      const MemberType &member, const MatrixType1 A, const MatrixType2 PDAD,
+      const VectorType1 Y, const VectorType2 PDY, const VectorType2 D2,
+      const VectorType2 tmp_v_1, const VectorType2 tmp_v_2);
+};
+
+template <typename MemberType>
+struct TeamVectorStaticPivoting {
+  template <class MatrixType1, class MatrixType2, class VectorType1,
+            class VectorType2>
+  KOKKOS_INLINE_FUNCTION static int invoke(
+      const MemberType &member, const MatrixType1 A, const MatrixType2 PDAD,
+      const VectorType1 Y, const VectorType2 PDY, const VectorType2 D2,
+      const VectorType2 tmp_v_1, const VectorType2 tmp_v_2);
+};
+
+template <class MatrixType1, class MatrixType2, class VectorType1,
+          class VectorType2>
+KOKKOS_INLINE_FUNCTION int SerialStaticPivoting::invoke(
+    const MatrixType1 A, const MatrixType2 PDAD, const VectorType1 Y,
+    const VectorType2 PDY, const VectorType2 D2, const VectorType2 tmp_v_1,
+    const VectorType2 tmp_v_2) {
+  using value_type = typename MatrixType1::non_const_value_type;
+  const size_t n   = A.extent(0);
+
+  // First, the algorithm loops over the rows and columns and search
+  // for the maximal absolute value per row and column.
+  for (size_t i = 0; i < n; ++i) {
+    D2(i)      = Kokkos::ArithTraits<value_type>::zero();
+    tmp_v_1(i) = 0;
+    tmp_v_2(i) = 1.;
+    for (size_t j = 0; j < n; ++j) {
+      if (D2(i) < Kokkos::abs(A(j, i))) D2(i) = Kokkos::abs(A(j, i));
+      if (tmp_v_1(i) < Kokkos::abs(A(i, j))) tmp_v_1(i) = Kokkos::abs(A(i, j));
+    }
+    D2(i) = 1. / D2(i);
+  }
+
+  // Then, the inverse of the maximal value per column is used to scale
+  // A by the right.
+  for (size_t i = 0; i < n; ++i) {
+    for (size_t j = 0; j < n; ++j) {
+      A(i, j) *= D2(j);
+    }
+  }
+
+  // Once again, the algorithm loops over the rows and store the maximal
+  // absolute value per row but after the right scalling and do a left scalling
+  // of A and Y.
+  value_type D1_i;
+  for (size_t i = 0; i < n; ++i) {
+    D1_i = Kokkos::ArithTraits<value_type>::zero();
+    for (size_t j = 0; j < n; ++j) {
+      if (D1_i < Kokkos::abs(A(i, j))) D1_i = Kokkos::abs(A(i, j));
+    }
+    D1_i = 1. / D1_i;
+    for (size_t j = 0; j < n; ++j) {
+      A(i, j) *= D1_i;
+    }
+    Y(i) *= D1_i;
+  }
+
+  // Finally, the algorithm starts to loop over the rows in an order such that
+  // their initial maximal absolute value decrease (it uses the tmp_v_1 to do
+  // so), then for a given row, it finds the available column with the largest
+  // absolute value. If this value is zero, the algorithm failed to compute a
+  // good pivot, otherwise it puts the current row to the found column index and
+  // it labels the row and column index as unavailable and continue the loop
+  // over the rows.
+  //
+  for (size_t i = 0; i < n; ++i) {
+    int row_index    = 0;
+    int col_index    = 0;
+    value_type tmp_0 = Kokkos::ArithTraits<value_type>::zero();
+    value_type tmp_1 = Kokkos::ArithTraits<value_type>::zero();
+    for (size_t j = 0; j < n; ++j) {
+      if (tmp_0 < tmp_v_1(j)) {
+        tmp_0     = tmp_v_1(j);
+        row_index = j;
+      }
+    }
+    for (size_t j = 0; j < n; ++j) {
+      if (tmp_1 < Kokkos::abs(A(row_index, j) * tmp_v_2(j))) {
+        tmp_1     = Kokkos::abs(A(row_index, j) * tmp_v_2(j));
+        col_index = j;
+      }
+    }
+    if (tmp_1 == Kokkos::ArithTraits<value_type>::zero()) return 1;
+    tmp_v_1(row_index) = Kokkos::ArithTraits<value_type>::zero();
+    tmp_v_2(col_index) = Kokkos::ArithTraits<value_type>::zero();
+
+    for (size_t j = 0; j < n; ++j) {
+      PDAD(col_index, j) = A(row_index, j);
+    }
+    PDY(col_index) = Y(row_index);
+  }
+
+  return 0;
+}
+
+template <typename MemberType>
+template <class MatrixType1, class MatrixType2, class VectorType1,
+          class VectorType2>
+KOKKOS_INLINE_FUNCTION int TeamStaticPivoting<MemberType>::invoke(
+    const MemberType &member, const MatrixType1 A, const MatrixType2 PDAD,
+    const VectorType1 Y, const VectorType2 PDY, const VectorType2 D2,
+    const VectorType2 tmp_v_1, const VectorType2 tmp_v_2) {
+  using value_type = typename MatrixType1::non_const_value_type;
+  using reducer_value_type =
+      typename Kokkos::MaxLoc<value_type, int>::value_type;
+  // This implementation follows the strategy of SerialStaticPivoting but uses
+  // an extra level of parallelism.
+
+  // Made this non-const in order to WORKAROUND issue #349 (Credit to C. Trott)
+  size_t n = A.extent(0);
+
+  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const int &i) {
+    D2(i)      = Kokkos::ArithTraits<value_type>::zero();
+    tmp_v_1(i) = 0;
+    tmp_v_2(i) = 1.;
+    for (size_t j = 0; j < n; ++j) {
+      if (D2(i) < Kokkos::abs(A(j, i))) D2(i) = Kokkos::abs(A(j, i));
+      if (tmp_v_1(i) < Kokkos::abs(A(i, j))) tmp_v_1(i) = Kokkos::abs(A(i, j));
+    }
+    D2(i) = 1. / D2(i);
+  });
+
+  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const int &i) {
+    for (size_t j = 0; j < n; ++j) {
+      A(i, j) *= D2(j);
+    }
+  });
+
+  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const int &i) {
+    value_type D1_i = Kokkos::ArithTraits<value_type>::zero();
+    for (size_t j = 0; j < n; ++j) {
+      if (D1_i < Kokkos::abs(A(i, j))) D1_i = Kokkos::abs(A(i, j));
+    }
+    D1_i = 1. / D1_i;
+    for (size_t j = 0; j < n; ++j) {
+      A(i, j) *= D1_i;
+    }
+    Y(i) *= D1_i;
+  });
+
+  for (size_t i = 0; i < n; ++i) {
+    int row_index, col_index;
+    reducer_value_type value;
+    Kokkos::MaxLoc<value_type, int> reducer_value(value);
+    Kokkos::parallel_reduce(
+        Kokkos::TeamThreadRange(member, n),
+        [&](const int &j, reducer_value_type &update) {
+          if (tmp_v_1(j) > update.val) {
+            update.val = tmp_v_1(j);
+            update.loc = j;
+          }
+        },
+        reducer_value);
+    row_index = value.loc;
+    value.loc = 0;
+    value.val = Kokkos::ArithTraits<value_type>::zero();
+    Kokkos::parallel_reduce(
+        Kokkos::TeamThreadRange(member, n),
+        [&](const int &j, reducer_value_type &update) {
+          if (Kokkos::abs(A(row_index, j) * tmp_v_2(j)) > update.val) {
+            update.val = Kokkos::abs(A(row_index, j) * tmp_v_2(j));
+            update.loc = j;
+          }
+        },
+        reducer_value);
+    col_index = value.loc;
+    if (value.val == Kokkos::ArithTraits<value_type>::zero()) return 1;
+    tmp_v_1(row_index) = Kokkos::ArithTraits<value_type>::zero();
+    tmp_v_2(col_index) = Kokkos::ArithTraits<value_type>::zero();
+
+    for (size_t j = 0; j < n; ++j) {
+      PDAD(col_index, j) = A(row_index, j);
+    }
+    PDY(col_index) = Y(row_index);
+  }
+  return 0;
+}
+
+template <typename MemberType>
+template <class MatrixType1, class MatrixType2, class VectorType1,
+          class VectorType2>
+KOKKOS_INLINE_FUNCTION int TeamVectorStaticPivoting<MemberType>::invoke(
+    const MemberType &member, const MatrixType1 A, const MatrixType2 PDAD,
+    const VectorType1 Y, const VectorType2 PDY, const VectorType2 D2,
+    const VectorType2 tmp_v_1, const VectorType2 tmp_v_2) {
+  using value_type = typename MatrixType1::non_const_value_type;
+  using reducer_value_type =
+      typename Kokkos::MaxLoc<value_type, int>::value_type;
+  // This implementation follows the strategy of SerialStaticPivoting but uses
+  // two extra levels of parallelism.
+
+  const size_t n = A.extent(0);
+
+  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const int &i) {
+    D2(i)      = Kokkos::ArithTraits<value_type>::zero();
+    tmp_v_1(i) = 0;
+    tmp_v_2(i) = 1.;
+    reducer_value_type value;
+    Kokkos::MaxLoc<value_type, int> reducer_value(value);
+    Kokkos::parallel_reduce(
+        Kokkos::ThreadVectorRange(member, n),
+        [&](const int &j, reducer_value_type &update) {
+          if (Kokkos::abs(A(j, i)) > update.val) {
+            update.val = Kokkos::abs(A(j, i));
+            update.loc = j;
+          }
+        },
+        reducer_value);
+    D2(i) = 1. / value.val;
+    Kokkos::parallel_reduce(
+        Kokkos::ThreadVectorRange(member, n),
+        [&](const int &j, reducer_value_type &update) {
+          if (Kokkos::abs(A(i, j)) > update.val) {
+            update.val = Kokkos::abs(A(i, j));
+            update.loc = j;
+          }
+        },
+        reducer_value);
+    tmp_v_1(i) = value.val;
+  });
+
+  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const int &i) {
+    Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, n),
+                         [&](const int &j) { A(i, j) *= D2(j); });
+  });
+
+  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const int &i) {
+    value_type D1_i = Kokkos::ArithTraits<value_type>::zero();
+    reducer_value_type value;
+    Kokkos::MaxLoc<value_type, int> reducer_value(value);
+    Kokkos::parallel_reduce(
+        Kokkos::ThreadVectorRange(member, n),
+        [&](const int &j, reducer_value_type &update) {
+          if (Kokkos::abs(A(i, j)) > update.val) {
+            update.val = Kokkos::abs(A(i, j));
+            update.loc = j;
+          }
+        },
+        reducer_value);
+    D1_i = 1. / value.val;
+    Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, n),
+                         [&](const int &j) { A(i, j) *= D1_i; });
+    Y(i) *= D1_i;
+  });
+
+  for (size_t i = 0; i < n; ++i) {
+    int row_index, col_index;
+    reducer_value_type value;
+    Kokkos::MaxLoc<value_type, int> reducer_value(value);
+    Kokkos::parallel_reduce(
+        Kokkos::TeamVectorRange(member, n),
+        [&](const int &j, reducer_value_type &update) {
+          if (tmp_v_1(j) > update.val) {
+            update.val = tmp_v_1(j);
+            update.loc = j;
+          }
+        },
+        reducer_value);
+    row_index = value.loc;
+    value.loc = 0;
+    value.val = Kokkos::ArithTraits<value_type>::zero();
+    Kokkos::parallel_reduce(
+        Kokkos::TeamVectorRange(member, n),
+        [&](const int &j, reducer_value_type &update) {
+          if (Kokkos::abs(A(row_index, j) * tmp_v_2(j)) > update.val) {
+            update.val = Kokkos::abs(A(row_index, j) * tmp_v_2(j));
+            update.loc = j;
+          }
+        },
+        reducer_value);
+    col_index = value.loc;
+    if (value.val == Kokkos::ArithTraits<value_type>::zero()) return 1;
+    tmp_v_1(row_index) = Kokkos::ArithTraits<value_type>::zero();
+    tmp_v_2(col_index) = Kokkos::ArithTraits<value_type>::zero();
+
+    Kokkos::parallel_for(Kokkos::TeamVectorRange(member, n), [&](const int &j) {
+      PDAD(col_index, j) = A(row_index, j);
+    });
+    PDY(col_index) = Y(row_index);
+  }
+  return 0;
+}
+
+template <class VectorType1, class VectorType2, class VectorType3>
+KOKKOS_INLINE_FUNCTION void SerialHadamard1D(const VectorType1 X,
+                                             const VectorType2 D,
+                                             const VectorType3 DX) {
+  const size_t n = X.extent(0);
+
+  for (size_t i = 0; i < n; ++i) {
+    DX(i) = D(i) * X(i);
+  }
+}
+
+template <typename MemberType, class VectorType1, class VectorType2,
+          class VectorType3>
+KOKKOS_INLINE_FUNCTION void TeamHadamard1D(const MemberType &member,
+                                           const VectorType1 X,
+                                           const VectorType2 D,
+                                           const VectorType3 DX) {
+  const size_t n = X.extent(0);
+
+  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n),
+                       [&](const size_t &i) { DX(i) = D(i) * X(i); });
+}
+
+template <typename MemberType, class VectorType1, class VectorType2,
+          class VectorType3>
+KOKKOS_INLINE_FUNCTION void TeamVectorHadamard1D(const MemberType &member,
+                                                 const VectorType1 X,
+                                                 const VectorType2 D,
+                                                 const VectorType3 DX) {
+  const size_t n = X.extent(0);
+
+  Kokkos::parallel_for(Kokkos::TeamVectorRange(member, n),
+                       [&](const size_t &i) { DX(i) = D(i) * X(i); });
+}
+
+///
+/// Serial Impl
+/// ===========
+template <>
+struct SerialGesv<Gesv::StaticPivoting> {
+  template <typename MatrixType, typename VectorType>
+  KOKKOS_INLINE_FUNCTION static int invoke(const MatrixType A,
+                                           const VectorType X,
+                                           const VectorType Y,
+                                           const MatrixType tmp) {
+#if (KOKKOSKERNELS_DEBUG_LEVEL > 0)
+    static_assert(Kokkos::is_view<MatrixType>::value,
+                  "KokkosBatched::gesv: MatrixType is not a Kokkos::View.");
+    static_assert(Kokkos::is_view<VectorType>::value,
+                  "KokkosBatched::gesv: VectorType is not a Kokkos::View.");
+    static_assert(MatrixType::Rank == 2,
+                  "KokkosBatched::gesv: MatrixType must have rank 2.");
+    static_assert(VectorType::Rank == 1,
+                  "KokkosBatched::gesv: VectorType must have rank 1.");
+
+    // Check compatibility of dimensions at run time.
+
+    if (A.extent(0) != tmp.extent(0) || A.extent(1) + 4 != tmp.extent(1)) {
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+          "KokkosBatched::gesv: dimensions of A and tmp do not match: A: "
+          "%d x %d, tmp (note: its second dimension should be the second "
+          "dimension of A + 4): %d x %d\n",
+          (int)A.extent(0), (int)A.extent(1), (int)tmp.extent(0),
+          (int)tmp.extent(1));
+      return 1;
+    }
+
+    if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) ||
+        A.extent(0) != Y.extent(0)) {
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+          "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: "
+          "%d x %d, X: %d, Y: %d\n",
+          (int)A.extent(0), (int)A.extent(1), (int)X.extent(0),
+          (int)Y.extent(0));
+      return 1;
+    }
+#endif
+
+    const int n = A.extent(0);
+
+    auto PDAD    = Kokkos::subview(tmp, Kokkos::ALL, Kokkos::make_pair(0, n));
+    auto PDY     = Kokkos::subview(tmp, Kokkos::ALL, n);
+    auto D2      = Kokkos::subview(tmp, Kokkos::ALL, n + 1);
+    auto tmp_v_1 = Kokkos::subview(tmp, Kokkos::ALL, n + 2);
+    auto tmp_v_2 = Kokkos::subview(tmp, Kokkos::ALL, n + 3);
+
+    if (SerialStaticPivoting::invoke(A, PDAD, Y, PDY, D2, tmp_v_1, tmp_v_2) ==
+        1) {
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+          "KokkosBatched::gesv: the currently implemented static pivoting "
+          "failed.\n");
+      return 1;
+    }
+
+    int r_val = SerialLU<Algo::Level3::Unblocked>::invoke(PDAD);
+
+    if (r_val == 0)
+      r_val =
+          SerialTrsm<Side::Left, Uplo::Lower, Trans::NoTranspose, Diag::Unit,
+                     Algo::Level3::Unblocked>::invoke(1.0, PDAD, PDY);
+
+    if (r_val == 0)
+      r_val =
+          SerialTrsm<Side::Left, Uplo::Upper, Trans::NoTranspose, Diag::NonUnit,
+                     Algo::Level3::Unblocked>::invoke(1.0, PDAD, PDY);
+
+    if (r_val == 0) SerialHadamard1D(PDY, D2, X);
+    return r_val;
+  }
+};
+
+template <>
+struct SerialGesv<Gesv::NoPivoting> {
+  template <typename MatrixType, typename VectorType>
+  KOKKOS_INLINE_FUNCTION static int invoke(const MatrixType A,
+                                           const VectorType X,
+                                           const VectorType Y,
+                                           const MatrixType /*tmp*/) {
+#if (KOKKOSKERNELS_DEBUG_LEVEL > 0)
+    static_assert(Kokkos::is_view<MatrixType>::value,
+                  "KokkosBatched::gesv: MatrixType is not a Kokkos::View.");
+    static_assert(Kokkos::is_view<VectorType>::value,
+                  "KokkosBatched::gesv: VectorType is not a Kokkos::View.");
+    static_assert(MatrixType::Rank == 2,
+                  "KokkosBatched::gesv: MatrixType must have rank 2.");
+    static_assert(VectorType::Rank == 1,
+                  "KokkosBatched::gesv: VectorType must have rank 1.");
+
+    // Check compatibility of dimensions at run time.
+
+    if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) ||
+        A.extent(0) != Y.extent(0)) {
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+          "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: "
+          "%d x %d, X: %d, Y: %d\n",
+          (int)A.extent(0), (int)A.extent(1), (int)X.extent(0),
+          (int)Y.extent(0));
+      return 1;
+    }
+#endif
+
+    int r_val = SerialLU<Algo::Level3::Unblocked>::invoke(A);
+
+    if (r_val == 0) r_val = SerialCopy<Trans::NoTranspose, 1>::invoke(Y, X);
+
+    if (r_val == 0)
+      r_val =
+          SerialTrsm<Side::Left, Uplo::Lower, Trans::NoTranspose, Diag::Unit,
+                     Algo::Level3::Unblocked>::invoke(1.0, A, X);
+
+    if (r_val == 0)
+      r_val =
+          SerialTrsm<Side::Left, Uplo::Upper, Trans::NoTranspose, Diag::NonUnit,
+                     Algo::Level3::Unblocked>::invoke(1.0, A, X);
+
+    return r_val;
+  }
+};
+
+///
+/// Team Impl
+/// =========
+
+template <typename MemberType>
+struct TeamGesv<MemberType, Gesv::StaticPivoting> {
+  template <typename MatrixType, typename VectorType>
+  KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member,
+                                           const MatrixType A,
+                                           const VectorType X,
+                                           const VectorType Y) {
+#if (KOKKOSKERNELS_DEBUG_LEVEL > 0)
+    static_assert(Kokkos::is_view<MatrixType>::value,
+                  "KokkosBatched::gesv: MatrixType is not a Kokkos::View.");
+    static_assert(Kokkos::is_view<VectorType>::value,
+                  "KokkosBatched::gesv: VectorType is not a Kokkos::View.");
+    static_assert(MatrixType::Rank == 2,
+                  "KokkosBatched::gesv: MatrixType must have rank 2.");
+    static_assert(VectorType::Rank == 1,
+                  "KokkosBatched::gesv: VectorType must have rank 1.");
+
+    // Check compatibility of dimensions at run time.
+    if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) ||
+        A.extent(0) != Y.extent(0)) {
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+          "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: "
+          "%d x %d, X: %d, Y: %d\n",
+          (int)A.extent(0), (int)A.extent(1), (int)X.extent(0),
+          (int)Y.extent(0));
+      return 1;
+    }
+#endif
+    using ScratchPadMatrixViewType = Kokkos::View<
+        typename MatrixType::non_const_value_type **,
+        typename MatrixType::execution_space::scratch_memory_space>;
+
+    const int n = A.extent(0);
+
+    ScratchPadMatrixViewType tmp(member.team_scratch(0), n, n + 4);
+    auto PDAD    = Kokkos::subview(tmp, Kokkos::ALL, Kokkos::make_pair(0, n));
+    auto PDY     = Kokkos::subview(tmp, Kokkos::ALL, n);
+    auto D2      = Kokkos::subview(tmp, Kokkos::ALL, n + 1);
+    auto tmp_v_1 = Kokkos::subview(tmp, Kokkos::ALL, n + 2);
+    auto tmp_v_2 = Kokkos::subview(tmp, Kokkos::ALL, n + 3);
+
+    if (TeamStaticPivoting<MemberType>::invoke(member, A, PDAD, Y, PDY, D2,
+                                               tmp_v_1, tmp_v_2) == 1) {
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+          "KokkosBatched::gesv: the currently implemented static pivoting "
+          "failed.\n");
+      return 1;
+    }
+    member.team_barrier();
+
+    int r_val =
+        TeamLU<MemberType, Algo::Level3::Unblocked>::invoke(member, PDAD);
+    member.team_barrier();
+
+    if (r_val == 0) {
+      r_val = TeamTrsm<MemberType, Side::Left, Uplo::Lower, Trans::NoTranspose,
+                       Diag::Unit, Algo::Level3::Unblocked>::invoke(member, 1.0,
+                                                                    PDAD, PDY);
+      member.team_barrier();
+    }
+
+    if (r_val == 0) {
+      r_val =
+          TeamTrsm<MemberType, Side::Left, Uplo::Upper, Trans::NoTranspose,
+                   Diag::NonUnit, Algo::Level3::Unblocked>::invoke(member, 1.0,
+                                                                   PDAD, PDY);
+      member.team_barrier();
+    }
+
+    if (r_val == 0) {
+      TeamHadamard1D(member, PDY, D2, X);
+      member.team_barrier();
+    }
+
+    return r_val;
+  }
+};
+
+template <typename MemberType>
+struct TeamGesv<MemberType, Gesv::NoPivoting> {
+  template <typename MatrixType, typename VectorType>
+  KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member,
+                                           const MatrixType A,
+                                           const VectorType X,
+                                           const VectorType Y) {
+#if (KOKKOSKERNELS_DEBUG_LEVEL > 0)
+    static_assert(Kokkos::is_view<MatrixType>::value,
+                  "KokkosBatched::gesv: MatrixType is not a Kokkos::View.");
+    static_assert(Kokkos::is_view<VectorType>::value,
+                  "KokkosBatched::gesv: VectorType is not a Kokkos::View.");
+    static_assert(MatrixType::Rank == 2,
+                  "KokkosBatched::gesv: MatrixType must have rank 2.");
+    static_assert(VectorType::Rank == 1,
+                  "KokkosBatched::gesv: VectorType must have rank 1.");
+
+    // Check compatibility of dimensions at run time.
+    if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) ||
+        A.extent(0) != Y.extent(0)) {
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+          "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: "
+          "%d x %d, X: %d, Y: %d\n",
+          (int)A.extent(0), (int)A.extent(1), (int)X.extent(0),
+          (int)Y.extent(0));
+      return 1;
+    }
+#endif
+
+    int r_val = TeamLU<MemberType, Algo::Level3::Unblocked>::invoke(member, A);
+    member.team_barrier();
+
+    if (r_val == 0) {
+      TeamCopy<MemberType, Trans::NoTranspose, 1>::invoke(member, Y, X);
+      member.team_barrier();
+    }
+
+    if (r_val == 0) {
+      TeamTrsm<MemberType, Side::Left, Uplo::Lower, Trans::NoTranspose,
+               Diag::Unit, Algo::Level3::Unblocked>::invoke(member, 1.0, A, X);
+      member.team_barrier();
+    }
+
+    if (r_val == 0) {
+      TeamTrsm<MemberType, Side::Left, Uplo::Upper, Trans::NoTranspose,
+               Diag::NonUnit, Algo::Level3::Unblocked>::invoke(member, 1.0, A,
+                                                               X);
+      member.team_barrier();
+    }
+
+    return r_val;
+  }
+};
+
+///
+/// TeamVector Impl
+/// =========
+
+template <typename MemberType>
+struct TeamVectorGesv<MemberType, Gesv::StaticPivoting> {
+  template <typename MatrixType, typename VectorType>
+  KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member,
+                                           const MatrixType A,
+                                           const VectorType X,
+                                           const VectorType Y) {
+#if (KOKKOSKERNELS_DEBUG_LEVEL > 0)
+    static_assert(Kokkos::is_view<MatrixType>::value,
+                  "KokkosBatched::gesv: MatrixType is not a Kokkos::View.");
+    static_assert(Kokkos::is_view<VectorType>::value,
+                  "KokkosBatched::gesv: VectorType is not a Kokkos::View.");
+    static_assert(MatrixType::Rank == 2,
+                  "KokkosBatched::gesv: MatrixType must have rank 2.");
+    static_assert(VectorType::Rank == 1,
+                  "KokkosBatched::gesv: VectorType must have rank 1.");
+
+    // Check compatibility of dimensions at run time.
+    if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) ||
+        A.extent(0) != Y.extent(0)) {
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+          "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: "
+          "%d x %d, X: %d, Y: %d\n",
+          (int)A.extent(0), (int)A.extent(1), (int)X.extent(0),
+          (int)Y.extent(0));
+      return 1;
+    }
+#endif
+    using ScratchPadMatrixViewType = Kokkos::View<
+        typename MatrixType::non_const_value_type **,
+        typename MatrixType::execution_space::scratch_memory_space>;
+
+    const int n = A.extent(0);
+
+    ScratchPadMatrixViewType tmp(member.team_scratch(0), n, n + 4);
+    auto PDAD    = Kokkos::subview(tmp, Kokkos::ALL, Kokkos::make_pair(0, n));
+    auto PDY     = Kokkos::subview(tmp, Kokkos::ALL, n);
+    auto D2      = Kokkos::subview(tmp, Kokkos::ALL, n + 1);
+    auto tmp_v_1 = Kokkos::subview(tmp, Kokkos::ALL, n + 2);
+    auto tmp_v_2 = Kokkos::subview(tmp, Kokkos::ALL, n + 3);
+
+    if (TeamVectorStaticPivoting<MemberType>::invoke(
+            member, A, PDAD, Y, PDY, D2, tmp_v_1, tmp_v_2) == 1) {
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+          "KokkosBatched::gesv: the currently implemented static pivoting "
+          "failed.\n");
+      return 1;
+    }
+
+    member.team_barrier();
+
+    int r_val =
+        TeamLU<MemberType, Algo::Level3::Unblocked>::invoke(member, PDAD);
+    member.team_barrier();
+
+    if (r_val == 0) {
+      TeamVectorTrsm<MemberType, Side::Left, Uplo::Lower, Trans::NoTranspose,
+                     Diag::Unit, Algo::Level3::Unblocked>::invoke(member, 1.0,
+                                                                  PDAD, PDY);
+      member.team_barrier();
+    }
+
+    if (r_val == 0) {
+      TeamVectorTrsm<MemberType, Side::Left, Uplo::Upper, Trans::NoTranspose,
+                     Diag::NonUnit, Algo::Level3::Unblocked>::invoke(member,
+                                                                     1.0, PDAD,
+                                                                     PDY);
+      member.team_barrier();
+    }
+
+    if (r_val == 0) {
+      TeamVectorHadamard1D(member, PDY, D2, X);
+      member.team_barrier();
+    }
+
+    return r_val;
+  }
+};
+
+template <typename MemberType>
+struct TeamVectorGesv<MemberType, Gesv::NoPivoting> {
+  template <typename MatrixType, typename VectorType>
+  KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member,
+                                           const MatrixType A,
+                                           const VectorType X,
+                                           const VectorType Y) {
+#if (KOKKOSKERNELS_DEBUG_LEVEL > 0)
+    static_assert(Kokkos::is_view<MatrixType>::value,
+                  "KokkosBatched::gesv: MatrixType is not a Kokkos::View.");
+    static_assert(Kokkos::is_view<VectorType>::value,
+                  "KokkosBatched::gesv: VectorType is not a Kokkos::View.");
+    static_assert(MatrixType::Rank == 2,
+                  "KokkosBatched::gesv: MatrixType must have rank 2.");
+    static_assert(VectorType::Rank == 1,
+                  "KokkosBatched::gesv: VectorType must have rank 1.");
+
+    // Check compatibility of dimensions at run time.
+    if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) ||
+        A.extent(0) != Y.extent(0)) {
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+          "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: "
+          "%d x %d, X: %d, Y: %d\n",
+          (int)A.extent(0), (int)A.extent(1), (int)X.extent(0),
+          (int)Y.extent(0));
+      return 1;
+    }
+#endif
+
+    int r_val = TeamLU<MemberType, Algo::Level3::Unblocked>::invoke(member, A);
+    member.team_barrier();
+
+    if (r_val == 0) {
+      TeamVectorCopy<MemberType, Trans::NoTranspose, 1>::invoke(member, Y, X);
+      member.team_barrier();
+    }
+
+    if (r_val == 0) {
+      TeamVectorTrsm<MemberType, Side::Left, Uplo::Lower, Trans::NoTranspose,
+                     Diag::Unit, Algo::Level3::Unblocked>::invoke(member, 1.0,
+                                                                  A, X);
+      member.team_barrier();
+    }
+
+    if (r_val == 0) {
+      TeamVectorTrsm<MemberType, Side::Left, Uplo::Upper, Trans::NoTranspose,
+                     Diag::NonUnit, Algo::Level3::Unblocked>::invoke(member,
+                                                                     1.0, A, X);
+      member.team_barrier();
+    }
+
+    return r_val;
+  }
+};
+
+}  // namespace KokkosBatched
+
+#endif
diff --git a/src/batched/dense/impl/KokkosBatched_HessenbergFormQ_Serial_Internal.hpp b/src/batched/dense/impl/KokkosBatched_HessenbergFormQ_Serial_Internal.hpp
index 58cd9bad2d..4c0f39097f 100644
--- a/src/batched/dense/impl/KokkosBatched_HessenbergFormQ_Serial_Internal.hpp
+++ b/src/batched/dense/impl/KokkosBatched_HessenbergFormQ_Serial_Internal.hpp
@@ -4,7 +4,7 @@
 /// \author Kyungjoo Kim (kyukim@sandia.gov)
 
 #include "KokkosBatched_Util.hpp"
-#include "KokkosBatched_Set_Internal.hpp"
+#include "KokkosBlas1_set_impl.hpp"
 #include "KokkosBatched_SetIdentity_Internal.hpp"
 #include "KokkosBatched_ApplyQ_Serial_Internal.hpp"
 
@@ -37,7 +37,8 @@ struct SerialHessenbergFormQInternal {
     ///   B is m x m
     // set identity
     if (is_Q_zero)
-      SerialSetInternal::invoke(m, value_type(1), Q, qs0 + qs1);
+      KokkosBlas::Impl::SerialSetInternal::invoke(m, value_type(1), Q,
+                                                  qs0 + qs1);
     else
       SerialSetIdentityInternal::invoke(m, Q, qs0, qs1);
 
diff --git a/src/batched/dense/impl/KokkosBatched_QR_FormQ_Serial_Internal.hpp b/src/batched/dense/impl/KokkosBatched_QR_FormQ_Serial_Internal.hpp
index 46feefb91b..23171c063e 100644
--- a/src/batched/dense/impl/KokkosBatched_QR_FormQ_Serial_Internal.hpp
+++ b/src/batched/dense/impl/KokkosBatched_QR_FormQ_Serial_Internal.hpp
@@ -4,7 +4,7 @@
 /// \author Kyungjoo Kim (kyukim@sandia.gov)
 
 #include "KokkosBatched_Util.hpp"
-#include "KokkosBatched_Set_Internal.hpp"
+#include "KokkosBlas1_set_impl.hpp"
 #include "KokkosBatched_SetIdentity_Internal.hpp"
 #include "KokkosBatched_ApplyQ_Serial_Internal.hpp"
 
diff --git a/src/batched/dense/impl/KokkosBatched_QR_FormQ_TeamVector_Internal.hpp b/src/batched/dense/impl/KokkosBatched_QR_FormQ_TeamVector_Internal.hpp
index 52178a095a..13a4ef4636 100644
--- a/src/batched/dense/impl/KokkosBatched_QR_FormQ_TeamVector_Internal.hpp
+++ b/src/batched/dense/impl/KokkosBatched_QR_FormQ_TeamVector_Internal.hpp
@@ -4,7 +4,7 @@
 /// \author Kyungjoo Kim (kyukim@sandia.gov)
 
 #include "KokkosBatched_Util.hpp"
-#include "KokkosBatched_Set_Internal.hpp"
+#include "KokkosBlas1_set_impl.hpp"
 #include "KokkosBatched_SetIdentity_Internal.hpp"
 #include "KokkosBatched_ApplyQ_TeamVector_Internal.hpp"
 
@@ -36,7 +36,8 @@ struct TeamVectorQR_FormQ_Internal {
 
     // set identity
     if (is_Q_zero)
-      TeamVectorSetInternal::invoke(member, m, value_type(1), Q, qs0 + qs1);
+      KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, value_type(1),
+                                                      Q, qs0 + qs1);
     else
       TeamVectorSetIdentityInternal::invoke(member, m, n, Q, qs0, qs1);
     member.team_barrier();
diff --git a/src/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp b/src/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp
index 0c7007bdf3..446ba50c03 100644
--- a/src/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp
+++ b/src/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp
@@ -37,10 +37,15 @@ struct SerialSVDInternal {
   KOKKOS_INLINE_FUNCTION static void symEigen2x2(value_type a11, value_type a21,
                                                  value_type a22, value_type& e1,
                                                  value_type& e2) {
-    value_type a       = Kokkos::ArithTraits<value_type>::one();
-    value_type b       = -a11 - a22;
-    value_type c       = a11 * a22 - a21 * a21;
-    value_type sqrtDet = Kokkos::Experimental::sqrt(b * b - 4 * a * c);
+    value_type a = Kokkos::ArithTraits<value_type>::one();
+    value_type b = -a11 - a22;
+    value_type c = a11 * a22 - a21 * a21;
+#if KOKKOS_VERSION >= 30699
+    using Kokkos::sqrt;
+#else
+    using Kokkos::Experimental::sqrt;
+#endif
+    value_type sqrtDet = sqrt(b * b - 4 * a * c);
     e1                 = (-b + sqrtDet) / (2 * a);
     e2                 = (-b - sqrtDet) / (2 * a);
   }
diff --git a/src/batched/dense/impl/KokkosBatched_Scale_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Scale_Impl.hpp
deleted file mode 100644
index b4e865ddea..0000000000
--- a/src/batched/dense/impl/KokkosBatched_Scale_Impl.hpp
+++ /dev/null
@@ -1,48 +0,0 @@
-#ifndef __KOKKOSBATCHED_SCALE_IMPL_HPP__
-#define __KOKKOSBATCHED_SCALE_IMPL_HPP__
-
-/// \author Kyungjoo Kim (kyukim@sandia.gov)
-
-#include "KokkosBatched_Util.hpp"
-#include "KokkosBatched_Scale_Internal.hpp"
-
-namespace KokkosBatched {
-
-///
-/// Serial Impl
-/// ===========
-template <typename ScalarType, typename AViewType>
-KOKKOS_INLINE_FUNCTION int SerialScale::invoke(const ScalarType alpha,
-                                               const AViewType &A) {
-  return SerialScaleInternal::invoke(A.extent(0), A.extent(1), alpha, A.data(),
-                                     A.stride_0(), A.stride_1());
-}
-
-///
-/// Team Impl
-/// =========
-
-template <typename MemberType>
-template <typename ScalarType, typename AViewType>
-KOKKOS_INLINE_FUNCTION int TeamScale<MemberType>::invoke(
-    const MemberType &member, const ScalarType alpha, const AViewType &A) {
-  return TeamScaleInternal::invoke(member, A.extent(0), A.extent(1), alpha,
-                                   A.data(), A.stride_0(), A.stride_1());
-}
-
-///
-/// TeamVector Impl
-/// ===============
-
-template <typename MemberType>
-template <typename ScalarType, typename AViewType>
-KOKKOS_INLINE_FUNCTION int TeamVectorScale<MemberType>::invoke(
-    const MemberType &member, const ScalarType alpha, const AViewType &A) {
-  return TeamVectorScaleInternal::invoke(member, A.extent(0), A.extent(1),
-                                         alpha, A.data(), A.stride_0(),
-                                         A.stride_1());
-}
-
-}  // namespace KokkosBatched
-
-#endif
diff --git a/src/batched/dense/impl/KokkosBatched_Set_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Set_Impl.hpp
deleted file mode 100644
index 148e051ce4..0000000000
--- a/src/batched/dense/impl/KokkosBatched_Set_Impl.hpp
+++ /dev/null
@@ -1,48 +0,0 @@
-#ifndef __KOKKOSBATCHED_SET_IMPL_HPP__
-#define __KOKKOSBATCHED_SET_IMPL_HPP__
-
-/// \author Kyungjoo Kim (kyukim@sandia.gov)
-
-#include "KokkosBatched_Util.hpp"
-#include "KokkosBatched_Set_Internal.hpp"
-
-namespace KokkosBatched {
-
-///
-/// Serial Impl
-/// ===========
-
-template <typename ScalarType, typename AViewType>
-KOKKOS_INLINE_FUNCTION int SerialSet::invoke(const ScalarType alpha,
-                                             const AViewType &A) {
-  return SerialSetInternal::invoke(A.extent(0), A.extent(1), alpha, A.data(),
-                                   A.stride_0(), A.stride_1());
-}
-
-///
-/// Team Impl
-/// =========
-
-template <typename MemberType>
-template <typename ScalarType, typename AViewType>
-KOKKOS_INLINE_FUNCTION int TeamSet<MemberType>::invoke(const MemberType &member,
-                                                       const ScalarType alpha,
-                                                       const AViewType &A) {
-  return TeamSetInternal::invoke(member, A.extent(0), A.extent(1), alpha,
-                                 A.data(), A.stride_0(), A.stride_1());
-}
-
-///
-/// TeamVector Impl
-/// ===============
-
-template <typename MemberType>
-template <typename ScalarType, typename AViewType>
-KOKKOS_INLINE_FUNCTION int TeamVectorSet<MemberType>::invoke(
-    const MemberType &member, const ScalarType alpha, const AViewType &A) {
-  return TeamVectorSetInternal::invoke(member, A.extent(0), A.extent(1), alpha,
-                                       A.data(), A.stride_0(), A.stride_1());
-}
-}  // end namespace KokkosBatched
-
-#endif
diff --git a/src/batched/dense/impl/KokkosBatched_ShiftedTrsv_Serial_Internal.hpp b/src/batched/dense/impl/KokkosBatched_ShiftedTrsv_Serial_Internal.hpp
index b0e2ea5b80..c6aec99d18 100644
--- a/src/batched/dense/impl/KokkosBatched_ShiftedTrsv_Serial_Internal.hpp
+++ b/src/batched/dense/impl/KokkosBatched_ShiftedTrsv_Serial_Internal.hpp
@@ -5,9 +5,8 @@
 
 #include "KokkosBatched_Util.hpp"
 
-#include "KokkosBatched_Set_Internal.hpp"
-#include "KokkosBatched_Scale_Internal.hpp"
-
+#include "KokkosBlas1_set_impl.hpp"
+#include "KokkosBlas1_team_scal_impl.hpp"
 #include "KokkosBatched_InnerTrsm_Serial_Impl.hpp"
 #include "KokkosBatched_Gemv_Serial_Internal.hpp"
 
diff --git a/src/batched/dense/impl/KokkosBatched_Trmm_Serial_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Trmm_Serial_Internal.hpp
index 9b5cc055e3..ac53992064 100644
--- a/src/batched/dense/impl/KokkosBatched_Trmm_Serial_Internal.hpp
+++ b/src/batched/dense/impl/KokkosBatched_Trmm_Serial_Internal.hpp
@@ -47,8 +47,8 @@
 
 #include "KokkosBatched_Util.hpp"
 
-#include "KokkosBatched_Set_Internal.hpp"
-#include "KokkosBatched_Scale_Internal.hpp"
+#include "KokkosBlas1_set_impl.hpp"
+#include "KokkosBlas1_serial_scal_impl.hpp"
 
 namespace KokkosBatched {
 
@@ -152,9 +152,10 @@ SerialTrmmInternalLeftLower<Algo::Trmm::Unblocked>::invoke(
   if (bm <= 0 || bn <= 0 || am <= 0 || an <= 0) return 0;
 
   if (alpha == zero)
-    SerialSetInternal::invoke(bm, bn, zero, B, bs0, bs1);
+    KokkosBlas::Impl::SerialSetInternal::invoke(bm, bn, zero, B, bs0, bs1);
   else {
-    if (alpha != one) SerialScaleInternal::invoke(bm, bn, alpha, B, bs0, bs1);
+    if (alpha != one)
+      KokkosBlas::Impl::SerialScaleInternal::invoke(bm, bn, alpha, B, bs0, bs1);
 
 #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
 #pragma unroll
@@ -240,9 +241,10 @@ SerialTrmmInternalRightLower<Algo::Trmm::Unblocked>::invoke(
   if (bm <= 0 || bn <= 0 || am <= 0 || an <= 0) return 0;
 
   if (alpha == zero)
-    SerialSetInternal::invoke(bm, bn, zero, B, bs0, bs1);
+    KokkosBlas::Impl::SerialSetInternal::invoke(bm, bn, zero, B, bs0, bs1);
   else {
-    if (alpha != one) SerialScaleInternal::invoke(bm, bn, alpha, B, bs0, bs1);
+    if (alpha != one)
+      KokkosBlas::Impl::SerialScaleInternal::invoke(bm, bn, alpha, B, bs0, bs1);
 
 #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
 #pragma unroll
@@ -321,9 +323,10 @@ SerialTrmmInternalLeftUpper<Algo::Trmm::Unblocked>::invoke(
   if (bm <= 0 || bn <= 0 || am <= 0 || an <= 0) return 0;
 
   if (alpha == zero)
-    SerialSetInternal::invoke(bm, bn, zero, B, bs0, bs1);
+    KokkosBlas::Impl::SerialSetInternal::invoke(bm, bn, zero, B, bs0, bs1);
   else {
-    if (alpha != one) SerialScaleInternal::invoke(bm, bn, alpha, B, bs0, bs1);
+    if (alpha != one)
+      KokkosBlas::Impl::SerialScaleInternal::invoke(bm, bn, alpha, B, bs0, bs1);
 
 #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
 #pragma unroll
@@ -401,9 +404,10 @@ SerialTrmmInternalRightUpper<Algo::Trmm::Unblocked>::invoke(
   if (bm <= 0 || bn <= 0 || am <= 0 || an <= 0) return 0;
 
   if (alpha == zero)
-    SerialSetInternal::invoke(bm, bn, zero, B, bs0, bs1);
+    KokkosBlas::Impl::SerialSetInternal::invoke(bm, bn, zero, B, bs0, bs1);
   else {
-    if (alpha != one) SerialScaleInternal::invoke(bm, bn, alpha, B, bs0, bs1);
+    if (alpha != one)
+      KokkosBlas::Impl::SerialScaleInternal::invoke(bm, bn, alpha, B, bs0, bs1);
 
 #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
 #pragma unroll
diff --git a/src/batched/dense/impl/KokkosBatched_Trsm_Serial_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Trsm_Serial_Internal.hpp
index b317bed4f7..b29b54931f 100644
--- a/src/batched/dense/impl/KokkosBatched_Trsm_Serial_Internal.hpp
+++ b/src/batched/dense/impl/KokkosBatched_Trsm_Serial_Internal.hpp
@@ -5,9 +5,8 @@
 
 #include "KokkosBatched_Util.hpp"
 
-#include "KokkosBatched_Set_Internal.hpp"
-#include "KokkosBatched_Scale_Internal.hpp"
-
+#include "KokkosBlas1_set_impl.hpp"
+#include "KokkosBlas1_serial_scal_impl.hpp"
 #include "KokkosBatched_InnerGemmFixA_Serial_Impl.hpp"
 #include "KokkosBatched_InnerTrsm_Serial_Impl.hpp"
 
@@ -39,9 +38,10 @@ SerialTrsmInternalLeftLower<Algo::Trsm::Unblocked>::invoke(
   const ScalarType one(1.0), zero(0.0);
 
   if (alpha == zero)
-    SerialSetInternal ::invoke(m, n, zero, B, bs0, bs1);
+    KokkosBlas::Impl::SerialSetInternal::invoke(m, n, zero, B, bs0, bs1);
   else {
-    if (alpha != one) SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1);
+    if (alpha != one)
+      KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1);
     if (m <= 0 || n <= 0) return 0;
 
     for (int p = 0; p < m; ++p) {
@@ -87,9 +87,10 @@ SerialTrsmInternalLeftLower<Algo::Trsm::Blocked>::invoke(
   const ScalarType one(1.0), zero(0.0), minus_one(-1.0);
 
   if (alpha == zero)
-    SerialSetInternal ::invoke(m, n, zero, B, bs0, bs1);
+    KokkosBlas::Impl::SerialSetInternal::invoke(m, n, zero, B, bs0, bs1);
   else {
-    if (alpha != one) SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1);
+    if (alpha != one)
+      KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1);
     if (m <= 0 || n <= 0) return 0;
 
     InnerTrsmLeftLowerUnitDiag<mbAlgo> trsm_u(as0, as1, bs0, bs1);
@@ -154,9 +155,10 @@ SerialTrsmInternalLeftUpper<Algo::Trsm::Unblocked>::invoke(
   const ScalarType one(1.0), zero(0.0);
 
   if (alpha == zero)
-    SerialSetInternal ::invoke(m, n, zero, B, bs0, bs1);
+    KokkosBlas::Impl::SerialSetInternal::invoke(m, n, zero, B, bs0, bs1);
   else {
-    if (alpha != one) SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1);
+    if (alpha != one)
+      KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1);
     if (m <= 0 || n <= 0) return 0;
 
     ValueType *KOKKOS_RESTRICT B0 = B;
@@ -202,9 +204,10 @@ SerialTrsmInternalLeftUpper<Algo::Trsm::Blocked>::invoke(
   constexpr int mbAlgo = Algo::Trsm::Blocked::mb();
 
   if (alpha == zero)
-    SerialSetInternal ::invoke(m, n, zero, B, bs0, bs1);
+    KokkosBlas::Impl::SerialSetInternal::invoke(m, n, zero, B, bs0, bs1);
   else {
-    if (alpha != one) SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1);
+    if (alpha != one)
+      KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1);
     if (m <= 0 || n <= 0) return 0;
 
     InnerTrsmLeftUpperUnitDiag<mbAlgo> trsm_u(as0, as1, bs0, bs1);
diff --git a/src/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Internal.hpp
index 0afa92ae6e..08819e8c18 100644
--- a/src/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Internal.hpp
+++ b/src/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Internal.hpp
@@ -5,8 +5,8 @@
 
 #include "KokkosBatched_Util.hpp"
 
-#include "KokkosBatched_Set_Internal.hpp"
-#include "KokkosBatched_Scale_Internal.hpp"
+#include "KokkosBlas1_set_impl.hpp"
+#include "KokkosBlas1_team_scal_impl.hpp"
 
 namespace KokkosBatched {
 
@@ -35,10 +35,12 @@ TeamVectorTrsmInternalLeftLower<Algo::Trsm::Unblocked>::invoke(
   const ScalarType one(1.0), zero(0.0);
 
   if (alpha == zero)
-    TeamVectorSetInternal ::invoke(member, m, n, zero, B, bs0, bs1);
+    KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, n, zero, B, bs0,
+                                                    bs1);
   else {
     if (alpha != one)
-      TeamVectorScaleInternal::invoke(member, m, n, alpha, B, bs0, bs1);
+      KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, n, alpha, B,
+                                                        bs0, bs1);
     if (m <= 0 || n <= 0) return 0;
 
     for (int p = 0; p < m; ++p) {
@@ -96,10 +98,12 @@ TeamVectorTrsmInternalLeftUpper<Algo::Trsm::Unblocked>::invoke(
 
   // note that parallel range is different ( m*n vs m-1*n);
   if (alpha == zero)
-    TeamVectorSetInternal ::invoke(member, m, n, zero, B, bs0, bs1);
+    KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, n, zero, B, bs0,
+                                                    bs1);
   else {
     if (alpha != one)
-      TeamVectorScaleInternal::invoke(member, m, n, alpha, B, bs0, bs1);
+      KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, n, alpha, B,
+                                                        bs0, bs1);
     if (m <= 0 || n <= 0) return 0;
 
     ValueType *KOKKOS_RESTRICT B0 = B;
diff --git a/src/batched/dense/impl/KokkosBatched_Trsm_Team_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Trsm_Team_Internal.hpp
index 37e5051675..f9e2bed8f8 100644
--- a/src/batched/dense/impl/KokkosBatched_Trsm_Team_Internal.hpp
+++ b/src/batched/dense/impl/KokkosBatched_Trsm_Team_Internal.hpp
@@ -6,9 +6,8 @@
 #include "KokkosBatched_Util.hpp"
 #include "KokkosKernels_ExecSpaceUtils.hpp"
 
-#include "KokkosBatched_Set_Internal.hpp"
-#include "KokkosBatched_Scale_Internal.hpp"
-
+#include "KokkosBlas1_set_impl.hpp"
+#include "KokkosBlas1_team_scal_impl.hpp"
 #include "KokkosBatched_InnerTrsm_Serial_Impl.hpp"
 #include "KokkosBatched_Gemm_Team_Internal.hpp"
 
@@ -39,10 +38,11 @@ TeamTrsmInternalLeftLower<Algo::Trsm::Unblocked>::invoke(
   const ScalarType one(1.0), zero(0.0);
 
   if (alpha == zero)
-    TeamSetInternal ::invoke(member, m, n, zero, B, bs0, bs1);
+    KokkosBlas::Impl::TeamSetInternal::invoke(member, m, n, zero, B, bs0, bs1);
   else {
     if (alpha != one)
-      TeamScaleInternal::invoke(member, m, n, alpha, B, bs0, bs1);
+      KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, n, alpha, B, bs0,
+                                                  bs1);
     if (m <= 0 || n <= 0) return 0;
 
     for (int p = 0; p < m; ++p) {
@@ -90,10 +90,11 @@ TeamTrsmInternalLeftLower<Algo::Trsm::Blocked>::invoke(
 
   // note that parallel range is different ( m*n vs m-1*n);
   if (alpha == zero)
-    TeamSetInternal ::invoke(member, m, n, zero, B, bs0, bs1);
+    KokkosBlas::Impl::TeamSetInternal::invoke(member, m, n, zero, B, bs0, bs1);
   else {
     if (alpha != one)
-      TeamScaleInternal::invoke(member, m, n, alpha, B, bs0, bs1);
+      KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, n, alpha, B, bs0,
+                                                  bs1);
     if (m <= 0 || n <= 0) return 0;
 
     ///
@@ -173,10 +174,11 @@ TeamTrsmInternalLeftUpper<Algo::Trsm::Unblocked>::invoke(
 
   // note that parallel range is different ( m*n vs m-1*n);
   if (alpha == zero)
-    TeamSetInternal ::invoke(member, m, n, zero, B, bs0, bs1);
+    KokkosBlas::Impl::TeamSetInternal::invoke(member, m, n, zero, B, bs0, bs1);
   else {
     if (alpha != one)
-      TeamScaleInternal::invoke(member, m, n, alpha, B, bs0, bs1);
+      KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, n, alpha, B, bs0,
+                                                  bs1);
     if (m <= 0 || n <= 0) return 0;
 
     ValueType *KOKKOS_RESTRICT B0 = B;
@@ -229,10 +231,11 @@ TeamTrsmInternalLeftUpper<Algo::Trsm::Blocked>::invoke(
 
   // note that parallel range is different ( m*n vs m-1*n);
   if (alpha == zero)
-    TeamSetInternal ::invoke(member, m, n, zero, B, bs0, bs1);
+    KokkosBlas::Impl::TeamSetInternal::invoke(member, m, n, zero, B, bs0, bs1);
   else {
     if (alpha != one)
-      TeamScaleInternal::invoke(member, m, n, alpha, B, bs0, bs1);
+      KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, n, alpha, B, bs0,
+                                                  bs1);
     if (m <= 0 || n <= 0) return 0;
 
     InnerTrsmLeftUpperUnitDiag<mbAlgo> trsm_u(as0, as1, bs0, bs1);
diff --git a/src/batched/dense/impl/KokkosBatched_Trsv_Serial_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Trsv_Serial_Internal.hpp
index fb28ea5a9c..926003083a 100644
--- a/src/batched/dense/impl/KokkosBatched_Trsv_Serial_Internal.hpp
+++ b/src/batched/dense/impl/KokkosBatched_Trsv_Serial_Internal.hpp
@@ -5,9 +5,8 @@
 
 #include "KokkosBatched_Util.hpp"
 
-#include "KokkosBatched_Set_Internal.hpp"
-#include "KokkosBatched_Scale_Internal.hpp"
-
+#include "KokkosBlas1_set_impl.hpp"
+#include "KokkosBlas1_serial_scal_impl.hpp"
 #include "KokkosBatched_InnerTrsm_Serial_Impl.hpp"
 #include "KokkosBatched_Gemv_Serial_Internal.hpp"
 
@@ -42,9 +41,10 @@ SerialTrsvInternalLower<Algo::Trsv::Unblocked>::invoke(
   const ScalarType one(1.0), zero(0.0);
 
   if (alpha == zero)
-    SerialSetInternal::invoke(m, zero, b, bs0);
+    KokkosBlas::Impl::SerialSetInternal::invoke(m, zero, b, bs0);
   else {
-    if (alpha != one) SerialScaleInternal::invoke(m, alpha, b, bs0);
+    if (alpha != one)
+      KokkosBlas::Impl::SerialScaleInternal::invoke(m, alpha, b, bs0);
     if (m <= 0) return 0;
 
     for (int p = 0; p < m; ++p) {
@@ -79,9 +79,10 @@ KOKKOS_INLINE_FUNCTION int SerialTrsvInternalLower<Algo::Trsv::Blocked>::invoke(
   constexpr int mbAlgo = Algo::Trsv::Blocked::mb();
 
   if (alpha == zero)
-    SerialSetInternal::invoke(m, zero, b, bs0);
+    KokkosBlas::Impl::SerialSetInternal::invoke(m, zero, b, bs0);
   else {
-    if (alpha != one) SerialScaleInternal::invoke(m, alpha, b, bs0);
+    if (alpha != one)
+      KokkosBlas::Impl::SerialScaleInternal::invoke(m, alpha, b, bs0);
     if (m <= 0) return 0;
 
     /// case GPU: team size is large and blocksize (mb,nb) is small
@@ -135,9 +136,10 @@ SerialTrsvInternalUpper<Algo::Trsv::Unblocked>::invoke(
   const ScalarType one(1.0), zero(0.0);
 
   if (alpha == zero)
-    SerialSetInternal::invoke(m, zero, b, bs0);
+    KokkosBlas::Impl::SerialSetInternal::invoke(m, zero, b, bs0);
   else {
-    if (alpha != one) SerialScaleInternal::invoke(m, alpha, b, bs0);
+    if (alpha != one)
+      KokkosBlas::Impl::SerialScaleInternal::invoke(m, alpha, b, bs0);
     if (m <= 0) return 0;
 
     ValueType *KOKKOS_RESTRICT b0 = b;
@@ -170,9 +172,10 @@ KOKKOS_INLINE_FUNCTION int SerialTrsvInternalUpper<Algo::Trsv::Blocked>::invoke(
 
   // note that parallel range is different ( m*n vs m-1*n);
   if (alpha == zero)
-    SerialSetInternal::invoke(m, zero, b, bs0);
+    KokkosBlas::Impl::SerialSetInternal::invoke(m, zero, b, bs0);
   else {
-    if (alpha != one) SerialScaleInternal::invoke(m, alpha, b, bs0);
+    if (alpha != one)
+      KokkosBlas::Impl::SerialScaleInternal::invoke(m, alpha, b, bs0);
     if (m <= 0) return 0;
 
     InnerTrsmLeftUpperUnitDiag<mbAlgo> trsm_u(as0, as1, bs0, 0);
diff --git a/src/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Internal.hpp
index ad50e6fc2a..b0da8f1f2d 100644
--- a/src/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Internal.hpp
+++ b/src/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Internal.hpp
@@ -5,8 +5,8 @@
 
 #include "KokkosBatched_Util.hpp"
 
-#include "KokkosBatched_Set_Internal.hpp"
-#include "KokkosBatched_Scale_Internal.hpp"
+#include "KokkosBlas1_set_impl.hpp"
+#include "KokkosBlas1_team_scal_impl.hpp"
 
 namespace KokkosBatched {
 
@@ -43,9 +43,11 @@ TeamVectorTrsvInternalLower<Algo::Trsv::Unblocked>::invoke(
   const ScalarType one(1.0), zero(0.0);
 
   if (alpha == zero)
-    TeamVectorSetInternal::invoke(member, m, zero, b, bs0);
+    KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, zero, b, bs0);
   else {
-    if (alpha != one) TeamVectorScaleInternal::invoke(member, m, alpha, b, bs0);
+    if (alpha != one)
+      KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, alpha, b,
+                                                        bs0);
     if (m <= 0) return 0;
 
     for (int p = 0; p < m; ++p) {
@@ -105,9 +107,11 @@ TeamVectorTrsvInternalUpper<Algo::Trsv::Unblocked>::invoke(
   const ScalarType one(1.0), zero(0.0);
 
   if (alpha == zero)
-    TeamVectorSetInternal::invoke(member, m, zero, b, bs0);
+    KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, zero, b, bs0);
   else {
-    if (alpha != one) TeamVectorScaleInternal::invoke(member, m, alpha, b, bs0);
+    if (alpha != one)
+      KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, alpha, b,
+                                                        bs0);
     if (m <= 0) return 0;
 
     ValueType *KOKKOS_RESTRICT b0 = b;
diff --git a/src/batched/dense/impl/KokkosBatched_Trsv_Team_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Trsv_Team_Internal.hpp
index 60b941e1ba..aaf72e9876 100644
--- a/src/batched/dense/impl/KokkosBatched_Trsv_Team_Internal.hpp
+++ b/src/batched/dense/impl/KokkosBatched_Trsv_Team_Internal.hpp
@@ -5,9 +5,8 @@
 
 #include "KokkosBatched_Util.hpp"
 
-#include "KokkosBatched_Set_Internal.hpp"
-#include "KokkosBatched_Scale_Internal.hpp"
-
+#include "KokkosBlas1_set_impl.hpp"
+#include "KokkosBlas1_team_scal_impl.hpp"
 #include "KokkosBatched_InnerTrsm_Serial_Impl.hpp"
 #include "KokkosBatched_Gemv_Team_Internal.hpp"
 
@@ -45,9 +44,10 @@ KOKKOS_INLINE_FUNCTION int TeamTrsvInternalLower<Algo::Trsv::Unblocked>::invoke(
   const ScalarType one(1.0), zero(0.0);
 
   if (alpha == zero)
-    TeamSetInternal::invoke(member, m, zero, b, bs0);
+    KokkosBlas::Impl::TeamSetInternal::invoke(member, m, zero, b, bs0);
   else {
-    if (alpha != one) TeamScaleInternal::invoke(member, m, alpha, b, bs0);
+    if (alpha != one)
+      KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, alpha, b, bs0);
     if (m <= 0) return 0;
 
     for (int p = 0; p < m; ++p) {
@@ -90,9 +90,10 @@ KOKKOS_INLINE_FUNCTION int TeamTrsvInternalLower<Algo::Trsv::Blocked>::invoke(
   constexpr int mbAlgo = Algo::Trsv::Blocked::mb();
 
   if (alpha == zero)
-    TeamSetInternal::invoke(member, m, zero, b, bs0);
+    KokkosBlas::Impl::TeamSetInternal::invoke(member, m, zero, b, bs0);
   else {
-    if (alpha != one) TeamScaleInternal::invoke(member, m, alpha, b, bs0);
+    if (alpha != one)
+      KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, alpha, b, bs0);
     if (m <= 0) return 0;
 
     /// case GPU: team size is large and blocksize (mb,nb) is small
@@ -154,9 +155,10 @@ KOKKOS_INLINE_FUNCTION int TeamTrsvInternalUpper<Algo::Trsv::Unblocked>::invoke(
   const ScalarType one(1.0), zero(0.0);
 
   if (alpha == zero)
-    TeamSetInternal::invoke(member, m, zero, b, bs0);
+    KokkosBlas::Impl::TeamSetInternal::invoke(member, m, zero, b, bs0);
   else {
-    if (alpha != one) TeamScaleInternal::invoke(member, m, alpha, b, bs0);
+    if (alpha != one)
+      KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, alpha, b, bs0);
     if (m <= 0) return 0;
 
     ValueType *KOKKOS_RESTRICT b0 = b;
@@ -197,9 +199,10 @@ KOKKOS_INLINE_FUNCTION int TeamTrsvInternalUpper<Algo::Trsv::Blocked>::invoke(
 
   // note that parallel range is different ( m*n vs m-1*n);
   if (alpha == zero)
-    TeamSetInternal::invoke(member, m, zero, b, bs0);
+    KokkosBlas::Impl::TeamSetInternal::invoke(member, m, zero, b, bs0);
   else {
-    if (alpha != one) TeamScaleInternal::invoke(member, m, alpha, b, bs0);
+    if (alpha != one)
+      KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, alpha, b, bs0);
     if (m <= 0) return 0;
 
     InnerTrsmLeftUpperUnitDiag<mbAlgo> trsm_u(as0, as1, bs0, 0);
diff --git a/src/batched/dense/impl/KokkosBatched_Trtri_Serial_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Trtri_Serial_Internal.hpp
index ee14040aed..8c8af6cbd5 100644
--- a/src/batched/dense/impl/KokkosBatched_Trtri_Serial_Internal.hpp
+++ b/src/batched/dense/impl/KokkosBatched_Trtri_Serial_Internal.hpp
@@ -108,8 +108,8 @@ SerialTrtriInternalLower<Algo::Trtri::Unblocked>::invoke(
 
       // SCAL -- x=ax
       // A((j+1):n,j) = A_ii * A((j+1):n,j)
-      SerialScaleInternal::invoke(A_col_vec_m, A_col_vec_n, A_ii, A_col_vec,
-                                  as0, as1);
+      KokkosBlas::Impl::SerialScaleInternal::invoke(A_col_vec_m, A_col_vec_n,
+                                                    A_ii, A_col_vec, as0, as1);
     }
   }
   return 0;
@@ -157,8 +157,8 @@ SerialTrtriInternalUpper<Algo::Trtri::Unblocked>::invoke(
 
       // SCAL -- x=ax
       // A((j+1):n,j) = A_ii * A((j+1):n,j)
-      SerialScaleInternal::invoke(A_col_vec_m, A_col_vec_n, A_ii, A_col_vec,
-                                  as0, as1);
+      KokkosBlas::Impl::SerialScaleInternal::invoke(A_col_vec_m, A_col_vec_n,
+                                                    A_ii, A_col_vec, as0, as1);
     }
   }
   return 0;
diff --git a/src/batched/sparse/KokkosBatched_CG.hpp b/src/batched/sparse/KokkosBatched_CG.hpp
index e1e6b5d6a4..7fa1f7e04b 100644
--- a/src/batched/sparse/KokkosBatched_CG.hpp
+++ b/src/batched/sparse/KokkosBatched_CG.hpp
@@ -68,12 +68,13 @@ namespace KokkosBatched {
 
 template <typename MemberType, typename ArgMode>
 struct CG {
-  template <typename OperatorType, typename VectorViewType>
-  KOKKOS_INLINE_FUNCTION static int invoke(
-      const MemberType &member, const OperatorType &A, const VectorViewType &B,
-      const VectorViewType &X,
-      const KrylovHandle<typename VectorViewType::non_const_value_type>
-          &handle) {
+  template <typename OperatorType, typename VectorViewType,
+            typename KrylovHandleType>
+  KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member,
+                                           const OperatorType &A,
+                                           const VectorViewType &B,
+                                           const VectorViewType &X,
+                                           const KrylovHandleType &handle) {
     int status = 0;
     if (std::is_same<ArgMode, Mode::Team>::value) {
       status =
diff --git a/src/batched/sparse/KokkosBatched_CrsMatrix.hpp b/src/batched/sparse/KokkosBatched_CrsMatrix.hpp
index 5448c4684c..d7fd94744f 100644
--- a/src/batched/sparse/KokkosBatched_CrsMatrix.hpp
+++ b/src/batched/sparse/KokkosBatched_CrsMatrix.hpp
@@ -104,89 +104,37 @@ class CrsMatrix {
   /// \param beta [in]: input coefficient for Y (default value 0.)
   /// \param Y [in/out]: Output vector Y, a rank 2 view
 
-  template <typename MemberType, typename XViewType, typename YViewType,
-            typename ArgTrans, typename ArgMode>
+  template <typename ArgTrans, typename ArgMode, typename MemberType,
+            typename XViewType, typename YViewType>
   KOKKOS_INLINE_FUNCTION void apply(
       const MemberType &member, const XViewType &X, const YViewType &Y,
       MagnitudeType alpha = Kokkos::Details::ArithTraits<MagnitudeType>::one(),
       MagnitudeType beta =
           Kokkos::Details::ArithTraits<MagnitudeType>::zero()) const {
-    if (beta == 0)
-      KokkosBatched::Spmv<MemberType, ArgTrans, ArgMode>::template invoke<
+    if (beta == Kokkos::Details::ArithTraits<MagnitudeType>::zero())
+      KokkosBatched::TeamVectorSpmv<MemberType, ArgTrans>::template invoke<
           ValuesViewType, IntViewType, XViewType, YViewType, 0>(
           member, alpha, values, row_ptr, colIndices, X, beta, Y);
     else
-      KokkosBatched::Spmv<MemberType, ArgTrans, ArgMode>::template invoke<
+      KokkosBatched::TeamVectorSpmv<MemberType, ArgTrans>::template invoke<
           ValuesViewType, IntViewType, XViewType, YViewType, 1>(
           member, alpha, values, row_ptr, colIndices, X, beta, Y);
   }
 
-  /// \brief apply version that uses variable coefficient alpha and no beta
-  ///   y_l <- alpha_l * A_l * x_l  for all l = 1, ..., N
-  /// where:
-  ///   * N is the number of matrices,
-  ///   * A_1, ..., A_N are N sparse matrices which share the same sparsity
-  ///   pattern,
-  ///   * x_1, ..., x_N are the N input vectors,
-  ///   * y_1, ..., y_N are the N output vectors,
-  ///   * alpha_1, ..., alpha_N are N scaling factors for x_1, ..., x_N.
-  ///
-  /// \tparam MemberType: Input type for the TeamPolicy member
-  /// \tparam XViewType: Input type for X, needs to be a 2D view
-  /// \tparam YViewType: Input type for Y, needs to be a 2D view
-  /// \tparam ArgTrans: Argument for transpose or notranspose
-  /// \tparam ArgMode: Argument for the parallelism used in the apply
-  ///
-  /// \param member [in]: TeamPolicy member
-  /// \param alpha [in]: input coefficient for X, a rank 1 view
-  /// \param X [in]: Input vector X, a rank 2 view
-  /// \param Y [out]: Output vector Y, a rank 2 view
-
-  template <typename MemberType, typename XViewType, typename YViewType,
-            typename NormViewType, typename ArgTrans, typename ArgMode>
-  KOKKOS_INLINE_FUNCTION void apply(const MemberType &member,
-                                    const XViewType &X, const YViewType &Y,
-                                    NormViewType alpha) const {
-    KokkosBatched::Spmv<MemberType, ArgTrans, ArgMode>::template invoke<
-        ValuesViewType, IntViewType, XViewType, YViewType, NormViewType,
-        NormViewType, 0>(member, alpha, values, row_ptr, colIndices, X, alpha,
-                         Y);
-  }
-
-  /// \brief apply version that uses variable coefficients alpha and beta
-  ///   y_l <- alpha_l * A_l * x_l + beta_l * y_l for all l = 1, ..., N
-  /// where:
-  ///   * N is the number of matrices,
-  ///   * A_1, ..., A_N are N sparse matrices which share the same sparsity
-  ///   pattern,
-  ///   * x_1, ..., x_N are the N input vectors,
-  ///   * y_1, ..., y_N are the N output vectors,
-  ///   * alpha_1, ..., alpha_N are N scaling factors for x_1, ..., x_N,
-  ///   * beta_1, ..., beta_N are N scaling factors for y_1, ..., y_N.
-  ///
-  /// \tparam MemberType: Input type for the TeamPolicy member
-  /// \tparam XViewType: Input type for X, needs to be a 2D view
-  /// \tparam YViewType: Input type for Y, needs to be a 2D view
-  /// \tparam NormViewType: Input type for alpha and beta, needs to be a 1D view
-  /// \tparam ArgTrans: Argument for transpose or notranspose
-  /// \tparam ArgMode: Argument for the parallelism used in the apply
-  ///
-  /// \param member [in]: TeamPolicy member
-  /// \param alpha [in]: input coefficient for X, a rank 1 view
-  /// \param X [in]: Input vector X, a rank 2 view
-  /// \param beta [in]: input coefficient for Y, a rank 1 view
-  /// \param Y [in/out]: Output vector Y, a rank 2 view
-
-  template <typename MemberType, typename XViewType, typename YViewType,
-            typename NormViewType, typename ArgTrans, typename ArgMode>
-  KOKKOS_INLINE_FUNCTION void apply(const MemberType &member,
-                                    const XViewType &X, const YViewType &Y,
-                                    const NormViewType &alpha,
-                                    const NormViewType &beta) const {
-    KokkosBatched::Spmv<MemberType, ArgTrans, ArgMode>::template invoke<
-        ValuesViewType, IntViewType, XViewType, YViewType, NormViewType,
-        NormViewType, 1>(member, alpha, values, row_ptr, colIndices, X, beta,
-                         Y);
+  template <typename ArgTrans, typename XViewType, typename YViewType>
+  KOKKOS_INLINE_FUNCTION void apply(
+      const XViewType &X, const YViewType &Y,
+      MagnitudeType alpha = Kokkos::Details::ArithTraits<MagnitudeType>::one(),
+      MagnitudeType beta =
+          Kokkos::Details::ArithTraits<MagnitudeType>::zero()) const {
+    if (beta == Kokkos::Details::ArithTraits<MagnitudeType>::zero())
+      KokkosBatched::SerialSpmv<ArgTrans>::template invoke<
+          ValuesViewType, IntViewType, XViewType, YViewType, 0>(
+          alpha, values, row_ptr, colIndices, X, beta, Y);
+    else
+      KokkosBatched::SerialSpmv<ArgTrans>::template invoke<
+          ValuesViewType, IntViewType, XViewType, YViewType, 1>(
+          alpha, values, row_ptr, colIndices, X, beta, Y);
   }
 };
 
diff --git a/src/batched/sparse/KokkosBatched_GMRES.hpp b/src/batched/sparse/KokkosBatched_GMRES.hpp
index 512970006b..51efc24aed 100644
--- a/src/batched/sparse/KokkosBatched_GMRES.hpp
+++ b/src/batched/sparse/KokkosBatched_GMRES.hpp
@@ -60,7 +60,9 @@
 /// \param handle [in]: a handle which provides different information such as
 /// the tolerance or the maximal number of iterations of the solver.
 
+#include <KokkosBatched_Krylov_Solvers.hpp>
 #include "KokkosBatched_Krylov_Handle.hpp"
+#include "KokkosBatched_GMRES_Serial_Impl.hpp"
 #include "KokkosBatched_GMRES_Team_Impl.hpp"
 #include "KokkosBatched_GMRES_TeamVector_Impl.hpp"
 
@@ -68,14 +70,18 @@ namespace KokkosBatched {
 
 template <typename MemberType, typename ArgMode>
 struct GMRES {
-  template <typename OperatorType, typename VectorViewType>
-  KOKKOS_INLINE_FUNCTION static int invoke(
-      const MemberType &member, const OperatorType &A, const VectorViewType &B,
-      const VectorViewType &X,
-      const KrylovHandle<typename VectorViewType::non_const_value_type>
-          &handle) {
+  template <typename OperatorType, typename VectorViewType,
+            typename KrylovHandleType>
+  KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member,
+                                           const OperatorType &A,
+                                           const VectorViewType &B,
+                                           const VectorViewType &X,
+                                           const KrylovHandleType &handle) {
     int status = 0;
-    if (std::is_same<ArgMode, Mode::Team>::value) {
+    if (std::is_same<ArgMode, Mode::Serial>::value) {
+      status = SerialGMRES::template invoke<OperatorType, VectorViewType>(
+          A, B, X, handle);
+    } else if (std::is_same<ArgMode, Mode::Team>::value) {
       status =
           TeamGMRES<MemberType>::template invoke<OperatorType, VectorViewType>(
               member, A, B, X, handle);
diff --git a/src/batched/sparse/KokkosBatched_Identity.hpp b/src/batched/sparse/KokkosBatched_Identity.hpp
index 57934df66a..6613bdd1ec 100644
--- a/src/batched/sparse/KokkosBatched_Identity.hpp
+++ b/src/batched/sparse/KokkosBatched_Identity.hpp
@@ -60,8 +60,8 @@ class Identity {
   KOKKOS_INLINE_FUNCTION
   ~Identity() {}
 
-  template <typename MemberType, typename XViewType, typename YViewType,
-            typename ArgTrans, typename ArgMode, int sameXY>
+  template <typename ArgTrans, typename ArgMode, int sameXY,
+            typename MemberType, typename XViewType, typename YViewType>
   KOKKOS_INLINE_FUNCTION void apply(const MemberType &member,
                                     const XViewType &X,
                                     const YViewType &Y) const {
@@ -76,6 +76,14 @@ class Identity {
       }
     }
   }
+  template <typename ArgTrans, int sameXY, typename XViewType,
+            typename YViewType>
+  KOKKOS_INLINE_FUNCTION void apply(const XViewType &X,
+                                    const YViewType &Y) const {
+    if (sameXY == 0) {
+      SerialCopy<Trans::NoTranspose>::invoke(X, Y);
+    }
+  }
 };
 
 }  // namespace KokkosBatched
diff --git a/src/batched/sparse/KokkosBatched_JacobiPrec.hpp b/src/batched/sparse/KokkosBatched_JacobiPrec.hpp
index 129378ed43..e4bfbefd0f 100644
--- a/src/batched/sparse/KokkosBatched_JacobiPrec.hpp
+++ b/src/batched/sparse/KokkosBatched_JacobiPrec.hpp
@@ -77,6 +77,8 @@ class JacobiPrec {
   KOKKOS_INLINE_FUNCTION
   ~JacobiPrec() {}
 
+  KOKKOS_INLINE_FUNCTION void setComputedInverse() { computed_inverse = true; }
+
   template <typename MemberType, typename ArgMode>
   KOKKOS_INLINE_FUNCTION void computeInverse(const MemberType &member) const {
     auto one     = Kokkos::Details::ArithTraits<MagnitudeType>::one();
@@ -141,8 +143,30 @@ class JacobiPrec {
     computed_inverse = true;
   }
 
-  template <typename MemberType, typename XViewType, typename YViewType,
-            typename ArgTrans, typename ArgMode, int sameXY>
+  KOKKOS_INLINE_FUNCTION void computeInverse() const {
+    auto one     = Kokkos::Details::ArithTraits<MagnitudeType>::one();
+    auto epsilon = Kokkos::Details::ArithTraits<MagnitudeType>::epsilon();
+    int tooSmall = 0;
+
+    for (int i = 0; i < n_operators; ++i)
+      for (int j = 0; j < n_colums; ++j) {
+        if (Kokkos::abs<ScalarType>(diag_values(i, j)) <= epsilon) {
+          ++tooSmall;
+          diag_values(i, j) = one;
+        } else
+          diag_values(i, j) = one / diag_values(i, j);
+      }
+
+    if (tooSmall > 0)
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+          "KokkosBatched::JacobiPrec: %d entrie(s) has/have a too small "
+          "magnitude and have been replaced by one, \n",
+          (int)tooSmall);
+    computed_inverse = true;
+  }
+
+  template <typename ArgTrans, typename ArgMode, int sameXY,
+            typename MemberType, typename XViewType, typename YViewType>
   KOKKOS_INLINE_FUNCTION void apply(const MemberType &member,
                                     const XViewType &X,
                                     const YViewType &Y) const {
@@ -154,6 +178,19 @@ class JacobiPrec {
     KokkosBatched::HadamardProduct<MemberType, ArgMode>::template invoke<
         ValuesViewType, XViewType, YViewType>(member, diag_values, X, Y);
   }
+
+  template <typename ArgTrans, int sameXY, typename XViewType,
+            typename YViewType>
+  KOKKOS_INLINE_FUNCTION void apply(const XViewType &X,
+                                    const YViewType &Y) const {
+    if (!computed_inverse) {
+      this->computeInverse();
+    }
+
+    KokkosBatched::SerialHadamardProduct::template invoke<ValuesViewType,
+                                                          XViewType, YViewType>(
+        diag_values, X, Y);
+  }
 };
 
 }  // namespace KokkosBatched
diff --git a/src/batched/sparse/KokkosBatched_Krylov_Handle.hpp b/src/batched/sparse/KokkosBatched_Krylov_Handle.hpp
index f14eac7065..3467a6f910 100644
--- a/src/batched/sparse/KokkosBatched_Krylov_Handle.hpp
+++ b/src/batched/sparse/KokkosBatched_Krylov_Handle.hpp
@@ -42,35 +42,194 @@
 //@HEADER
 */
 
-#include <Kokkos_Core.hpp>
-#include <iostream>
-#include <string>
-
 #ifndef __KOKKOSBATCHED_KRYLOV_HANDLE_HPP__
 #define __KOKKOSBATCHED_KRYLOV_HANDLE_HPP__
-//#define VERBOSE
+
+#include <KokkosBatched_Krylov_Solvers.hpp>
+#include <Kokkos_Core.hpp>
 
 namespace KokkosBatched {
 
 /// \brief KrylovHandle
 ///
-/// \tparam scalar_type: Scalar type of the linear solver
+/// The handle is used to pass information between the Krylov solver and the
+/// calling code.
+///
+/// The handle has some views as data member, their required size can be
+/// different depending on the used Krylov solver.
+///
+/// In the case of the Batched GMRES, the size should be as follows:
+///  - Arnoldi_view a batched_size x max_iteration x (n_rows + max_iteration +
+///  3);
+///  - tmp_view is NOT used for the team/teamvector GMRES;
+///    it is used for the serial GMRES and the size is batched_size x (n_rows +
+///    max_iteration + 3);
+///  - residual_norms is an optional batched_size x (max_iteration + 2) used to
+///  store the convergence history;
+///  - iteration_numbers is a 1D view of length batched_size;
+///  - first_index and last_index are 1D of length n_teams.
+///
+/// \tparam NormViewType: type of the view used to store the convergence history
+/// \tparam IntViewType: type of the view used to store the number of iteration
+/// per system \tparam ViewType3D: type of the 3D temporary views
 
-template <class scalar_type>
+template <class NormViewType, class IntViewType, class ViewType3D>
 class KrylovHandle {
  public:
-  using norm_type =
-      typename Kokkos::Details::ArithTraits<scalar_type>::mag_type;
+  using norm_type = typename NormViewType::non_const_value_type;
+
+  typedef ViewType3D ArnoldiViewType;
+  typedef Kokkos::View<typename ViewType3D::non_const_value_type **,
+                       typename ViewType3D::array_layout,
+                       typename ViewType3D::execution_space>
+      TemporaryViewType;
+
+ public:
+  NormViewType residual_norms;
+  IntViewType iteration_numbers;
+  typename NormViewType::HostMirror residual_norms_host;
+  typename IntViewType::HostMirror iteration_numbers_host;
+  IntViewType first_index;
+  IntViewType last_index;
+  ArnoldiViewType Arnoldi_view;
+  TemporaryViewType tmp_view;
 
  private:
   norm_type tolerance;
+  norm_type max_tolerance;
   int max_iteration;
+  int batched_size;
+  const int N_team;
+  int n_teams;
+  int ortho_strategy;
+  int scratch_pad_level;
+  bool compute_last_residual;
+  bool monitor_residual;
+  bool host_synchronised;
 
  public:
-  KOKKOS_INLINE_FUNCTION
-  KrylovHandle() {
+  KrylovHandle(int _batched_size, int _N_team, int _max_iteration = 200,
+               bool _monitor_residual = false)
+      : max_iteration(_max_iteration),
+        batched_size(_batched_size),
+        N_team(_N_team),
+        monitor_residual(_monitor_residual) {
     tolerance     = Kokkos::Details::ArithTraits<norm_type>::epsilon();
-    max_iteration = 200;
+    max_tolerance = 1e-30;
+    if (std::is_same<norm_type, double>::value) max_tolerance = 1e-50;
+    if (monitor_residual) {
+      residual_norms = NormViewType("", batched_size, max_iteration + 2);
+    }
+    iteration_numbers = IntViewType("", batched_size);
+    Kokkos::deep_copy(iteration_numbers, -1);
+
+    n_teams     = ceil(1. * batched_size / N_team);
+    first_index = IntViewType("", n_teams);
+    last_index  = IntViewType("", n_teams);
+
+    auto first_index_host = Kokkos::create_mirror_view(first_index);
+    auto last_index_host  = Kokkos::create_mirror_view(last_index);
+
+    first_index_host(0) = 0;
+    last_index_host(0)  = N_team;
+    for (int i = 1; i < n_teams; ++i) {
+      first_index_host(i) = last_index_host(i - 1);
+      last_index_host(i)  = first_index_host(i) + N_team;
+    }
+    last_index_host(n_teams - 1) = batched_size;
+
+    Kokkos::deep_copy(first_index, first_index_host);
+    Kokkos::deep_copy(last_index, last_index_host);
+
+    // Default Classical GS
+    ortho_strategy        = 1;
+    scratch_pad_level     = 0;
+    compute_last_residual = true;
+    host_synchronised     = false;
+  }
+
+  /// \brief get_number_of_systems_per_team
+  int get_number_of_systems_per_team() { return N_team; }
+
+  /// \brief get_number_of_teams
+  int get_number_of_teams() { return n_teams; }
+
+  /// \brief reset
+  ///   Reset the iteration numbers to the default value of -1
+  ///   and the residual norms if monitored.
+  ///   (Usefull when mulitple consecutive solvers use the same handle)
+  ///
+
+  void reset() {
+    Kokkos::deep_copy(iteration_numbers, -1);
+    if (monitor_residual) {
+      Kokkos::deep_copy(residual_norms, 0.);
+    }
+    host_synchronised = false;
+  }
+
+  /// \brief synchronise_host
+  ///   Synchronise host and device.
+  ///
+
+  void synchronise_host() {
+    iteration_numbers_host = Kokkos::create_mirror_view(iteration_numbers);
+    Kokkos::deep_copy(iteration_numbers_host, iteration_numbers);
+    if (monitor_residual) {
+      residual_norms_host = Kokkos::create_mirror_view(residual_norms);
+      Kokkos::deep_copy(residual_norms_host, residual_norms);
+    }
+    host_synchronised = true;
+  }
+
+  /// \brief is_converged
+  ///   Test if all the systems have converged.
+  ///
+
+  KOKKOS_INLINE_FUNCTION
+  bool is_converged() const {
+    bool all_converged = true;
+    for (size_t i = 0; i < batched_size; ++i)
+      if (iteration_numbers(i) == -1) {
+        all_converged = false;
+        break;
+      }
+    return all_converged;
+  }
+
+  /// \brief is_converged_host
+  ///   Test if all the systems have converged (host).
+  ///
+
+  bool is_converged_host() {
+    if (!host_synchronised) this->synchronise_host();
+    bool all_converged = true;
+    for (int i = 0; i < batched_size; ++i)
+      if (iteration_numbers_host(i) == -1) {
+        all_converged = false;
+        break;
+      }
+    return all_converged;
+  }
+
+  /// \brief is_converged
+  ///   Test if one particular system has converged.
+  ///
+  /// \param batched_id [in]: Global batched ID
+
+  KOKKOS_INLINE_FUNCTION
+  bool is_converged(int batched_id) const {
+    return (iteration_numbers(batched_id) != -1);
+  }
+
+  /// \brief is_converged
+  ///   Test if one particular system has converged (host).
+  ///
+  /// \param batched_id [in]: Global batched ID
+
+  bool is_converged_host(int batched_id) {
+    if (!host_synchronised) this->synchronise_host();
+    return (iteration_numbers_host(batched_id) != -1);
   }
 
   /// \brief set_tolerance
@@ -87,21 +246,259 @@ class KrylovHandle {
   KOKKOS_INLINE_FUNCTION
   norm_type get_tolerance() const { return tolerance; }
 
+  /// \brief set_max_tolerance
+  ///   Set the maximal tolerance of the batched Krylov solver
+  ///
+  /// \param _max_tolerance [in]: New tolerance
+
+  KOKKOS_INLINE_FUNCTION
+  void set_max_tolerance(norm_type _max_tolerance) {
+    max_tolerance = _max_tolerance;
+  }
+
+  /// \brief get_max_tolerance
+  ///   Get the maximal tolerance of the batched Krylov solver
+
+  KOKKOS_INLINE_FUNCTION
+  norm_type get_max_tolerance() const { return max_tolerance; }
+
   /// \brief set_max_iteration
   ///   Set the maximum number of iterations of the batched Krylov solver
   ///
   /// \param _max_iteration [in]: New maximum number of iterations
 
   KOKKOS_INLINE_FUNCTION
-  void set_max_iteration(norm_type _max_iteration) {
-    max_iteration = _max_iteration;
-  }
+  void set_max_iteration(int _max_iteration) { max_iteration = _max_iteration; }
 
   /// \brief get_max_iteration
   ///   Get the maximum number of iterations of the batched Krylov solver
 
   KOKKOS_INLINE_FUNCTION
   int get_max_iteration() const { return max_iteration; }
+
+  /// \brief get_norm
+  ///   Get the norm of one system at a given iteration
+  ///
+  /// \param batched_id [in]: Global batched ID
+  /// \param iteration_id [in]: Iteration ID
+
+  KOKKOS_INLINE_FUNCTION
+  norm_type get_norm(int batched_id, int iteration_id) const {
+    if (monitor_residual) {
+      return residual_norms(batched_id, iteration_id);
+    } else
+      return 0;
+  }
+
+  /// \brief get_norm_host
+  ///   Get the norm of one system at a given iteration (host)
+  ///
+  /// \param batched_id [in]: Global batched ID
+  /// \param iteration_id [in]: Iteration ID
+
+  norm_type get_norm_host(int batched_id, int iteration_id) {
+    if (monitor_residual) {
+      if (!host_synchronised) this->synchronise_host();
+      return residual_norms_host(batched_id, iteration_id);
+    } else
+      return 0;
+  }
+
+  /// \brief get_last_norm
+  ///   Get the last norm of one system
+  ///
+  /// \param batched_id [in]: Global batched ID
+
+  KOKKOS_INLINE_FUNCTION
+  norm_type get_last_norm(int batched_id) const {
+    if (monitor_residual && compute_last_residual) {
+      return residual_norms(batched_id, max_iteration + 1);
+    } else
+      return 0;
+  }
+
+  /// \brief get_last_norm_host
+  ///   Get the last norm of one system (host)
+  ///
+  /// \param batched_id [in]: Global batched ID
+
+  norm_type get_last_norm_host(int batched_id) {
+    if (monitor_residual && compute_last_residual) {
+      if (!host_synchronised) this->synchronise_host();
+      return residual_norms_host(batched_id, max_iteration + 1);
+    } else
+      return 0;
+  }
+
+  /// \brief get_iteration
+  ///   Get the number of iteration after convergence for one system
+  ///
+  /// \param batched_id [in]: Global batched ID
+
+  KOKKOS_INLINE_FUNCTION
+  int get_iteration(int batched_id) const {
+    return iteration_numbers(batched_id);
+  }
+
+  /// \brief get_iteration_host
+  ///   Get the number of iteration after convergence for one system (host)
+  ///
+  /// \param batched_id [in]: Global batched ID
+
+  int get_iteration_host(int batched_id) {
+    if (!host_synchronised) this->synchronise_host();
+    return iteration_numbers_host(batched_id);
+  }
+
+  /// \brief set_ortho_strategy
+  ///   Set the used orthogonalization strategy.
+  ///   Either classical GS (_ortho_strategy=0) or modified GS
+  ///   (_ortho_strategy=1)
+  ///
+  /// \param _ortho_strategy [in]: used orthogonalization strategy
+
+  KOKKOS_INLINE_FUNCTION
+  void set_ortho_strategy(int _ortho_strategy) {
+    ortho_strategy = _ortho_strategy;
+  }
+
+  /// \brief get_ortho_strategy
+  ///   Get the used orthogonalization strategy.
+  ///   Either classical GS (_ortho_strategy=0) or modified GS
+  ///   (_ortho_strategy=1)
+
+  KOKKOS_INLINE_FUNCTION
+  int get_ortho_strategy() const { return ortho_strategy; }
+
+  /// \brief set_scratch_pad_level
+  ///   Set the scratch pad level used to store temporary variables.
+  ///
+  /// \param _scratch_pad_level [in]: used level
+
+  KOKKOS_INLINE_FUNCTION
+  void set_scratch_pad_level(int _scratch_pad_level) {
+    scratch_pad_level = _scratch_pad_level;
+  }
+
+  /// \brief get_scratch_pad_level
+  ///   Get the scratch pad level used to store temporary variables.
+
+  KOKKOS_INLINE_FUNCTION
+  int get_scratch_pad_level() const { return scratch_pad_level; }
+
+  /// \brief set_compute_last_residual
+  ///   Select if the last residual is explicitly computed.
+  ///
+  /// \param _compute_last_residual [in]: boolean that specifies if we compute
+  /// the last residual explicitly
+
+  KOKKOS_INLINE_FUNCTION
+  void set_compute_last_residual(bool _compute_last_residual) {
+    if (monitor_residual)
+      compute_last_residual = _compute_last_residual;
+    else
+      compute_last_residual = false;
+  }
+
+  /// \brief get_compute_last_residual
+  ///   Specify if the last residual has to be computed explicitly.
+
+  KOKKOS_INLINE_FUNCTION
+  bool get_compute_last_residual() const {
+    if (monitor_residual)
+      return compute_last_residual;
+    else
+      return false;
+  }
+
+ private:
+  /// \brief set_norm
+  ///   Store the norm of one of the system at one of the iteration
+  ///
+  /// \param batched_id [in]: Global batched ID
+  /// \param iteration_id [in]: Iteration ID
+  /// \param norm_i [in]: Norm to store
+
+  KOKKOS_INLINE_FUNCTION
+  void set_norm(int batched_id, int iteration_id, norm_type norm_i) const {
+    if (monitor_residual) residual_norms(batched_id, iteration_id) = norm_i;
+  }
+
+  /// \brief set_norm
+  ///   Store the norm of one of the system at one of the iteration
+  ///
+  /// \param batchedteam_id [in]: Team ID
+  /// \param batched_id [in]: Local batched ID (local ID within the team)
+  /// \param iteration_id [in]: Iteration ID
+  /// \param norm_i [in]: Norm to store
+
+  KOKKOS_INLINE_FUNCTION
+  void set_norm(int team_id, int batched_id, int iteration_id,
+                norm_type norm_i) const {
+    if (monitor_residual)
+      residual_norms(team_id * N_team + batched_id, iteration_id) = norm_i;
+  }
+
+  /// \brief set_last_norm
+  ///   Store the last norm of one system
+  ///
+  /// \param batched_id [in]: Global batched ID
+  /// \param norm_i [in]: Norm to store
+
+  KOKKOS_INLINE_FUNCTION
+  void set_last_norm(int batched_id, norm_type norm_i) const {
+    if (monitor_residual)
+      residual_norms(batched_id, max_iteration + 1) = norm_i;
+  }
+
+  /// \brief set_last_norm
+  ///   Store the last norm of one system
+  ///
+  /// \param batchedteam_id [in]: Team ID
+  /// \param batched_id [in]: Local batched ID (local ID within the team)
+  /// \param batched_id [in]: Global batched ID
+  /// \param norm_i [in]: Norm to store
+
+  KOKKOS_INLINE_FUNCTION
+  void set_last_norm(int team_id, int batched_id, norm_type norm_i) const {
+    if (monitor_residual)
+      residual_norms(team_id * N_team + batched_id, max_iteration + 1) = norm_i;
+  }
+
+  /// \brief set_iteration
+  ///   Store the number of iteration after convergence for one system
+  ///
+  /// \param batched_id [in]: Global batched ID
+  /// \param iteration_id [in]: Iteration ID
+
+  KOKKOS_INLINE_FUNCTION
+  void set_iteration(int batched_id, int iteration_id) const {
+    iteration_numbers(batched_id) = iteration_id;
+  }
+
+  /// \brief set_iteration
+  ///   Store the number of iteration after convergence for one system
+  ///
+  /// \param batchedteam_id [in]: Team ID
+  /// \param batched_id [in]: Local batched ID (local ID within the team)
+  /// \param iteration_id [in]: Iteration ID
+
+  KOKKOS_INLINE_FUNCTION
+  void set_iteration(int team_id, int batched_id, int iteration_id) const {
+    iteration_numbers(team_id * N_team + batched_id) = iteration_id;
+  }
+
+ public:
+  friend struct SerialGMRES;
+  template <typename MemberType>
+  friend struct TeamGMRES;
+  template <typename MemberType>
+  friend struct TeamVectorGMRES;
+
+  template <typename MemberType>
+  friend struct TeamCG;
+  template <typename MemberType>
+  friend struct TeamVectorCG;
 };
 
 }  // namespace KokkosBatched
diff --git a/src/batched/sparse/KokkosBatched_Krylov_Solvers.hpp b/src/batched/sparse/KokkosBatched_Krylov_Solvers.hpp
new file mode 100644
index 0000000000..413c72678f
--- /dev/null
+++ b/src/batched/sparse/KokkosBatched_Krylov_Solvers.hpp
@@ -0,0 +1,129 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef __KOKKOSBATCHED_KRYLOV_SOLVERS_HPP__
+#define __KOKKOSBATCHED_KRYLOV_SOLVERS_HPP__
+
+namespace KokkosBatched {
+
+struct SerialGMRES {
+  template <typename OperatorType, typename VectorViewType,
+            typename PrecOperatorType, typename KrylovHandleType>
+  KOKKOS_INLINE_FUNCTION static int invoke(const OperatorType& A,
+                                           const VectorViewType& _B,
+                                           const VectorViewType& _X,
+                                           const PrecOperatorType& P,
+                                           const KrylovHandleType& handle,
+                                           const int GMRES_id);
+  template <typename OperatorType, typename VectorViewType,
+            typename KrylovHandleType>
+  KOKKOS_INLINE_FUNCTION static int invoke(const OperatorType& A,
+                                           const VectorViewType& _B,
+                                           const VectorViewType& _X,
+                                           const KrylovHandleType& handle);
+};
+
+template <typename MemberType>
+struct TeamGMRES {
+  template <typename OperatorType, typename VectorViewType,
+            typename PrecOperatorType, typename KrylovHandleType>
+  KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member,
+                                           const OperatorType& A,
+                                           const VectorViewType& _B,
+                                           const VectorViewType& _X,
+                                           const PrecOperatorType& P,
+                                           const KrylovHandleType& handle);
+  template <typename OperatorType, typename VectorViewType,
+            typename KrylovHandleType>
+  KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member,
+                                           const OperatorType& A,
+                                           const VectorViewType& _B,
+                                           const VectorViewType& _X,
+                                           const KrylovHandleType& handle);
+};
+
+template <typename MemberType>
+struct TeamVectorGMRES {
+  template <typename OperatorType, typename VectorViewType,
+            typename PrecOperatorType, typename KrylovHandleType>
+  KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member,
+                                           const OperatorType& A,
+                                           const VectorViewType& _B,
+                                           const VectorViewType& _X,
+                                           const PrecOperatorType& P,
+                                           const KrylovHandleType& handle);
+  template <typename OperatorType, typename VectorViewType,
+            typename KrylovHandleType>
+  KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member,
+                                           const OperatorType& A,
+                                           const VectorViewType& _B,
+                                           const VectorViewType& _X,
+                                           const KrylovHandleType& handle);
+};
+
+template <typename MemberType>
+struct TeamCG {
+  template <typename OperatorType, typename VectorViewType,
+            typename KrylovHandleType>
+  KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member,
+                                           const OperatorType& A,
+                                           const VectorViewType& _B,
+                                           const VectorViewType& _X,
+                                           const KrylovHandleType& handle);
+};
+
+template <typename MemberType>
+struct TeamVectorCG {
+  template <typename OperatorType, typename VectorViewType,
+            typename KrylovHandleType>
+  KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member,
+                                           const OperatorType& A,
+                                           const VectorViewType& _B,
+                                           const VectorViewType& _X,
+                                           const KrylovHandleType& handle);
+};
+
+}  // namespace KokkosBatched
+
+#endif
diff --git a/src/batched/sparse/impl/KokkosBatched_CG_TeamVector_Impl.hpp b/src/batched/sparse/impl/KokkosBatched_CG_TeamVector_Impl.hpp
index 83e8fb90ed..a106d0ae8f 100644
--- a/src/batched/sparse/impl/KokkosBatched_CG_TeamVector_Impl.hpp
+++ b/src/batched/sparse/impl/KokkosBatched_CG_TeamVector_Impl.hpp
@@ -61,139 +61,145 @@ namespace KokkosBatched {
 ///
 
 template <typename MemberType>
-struct TeamVectorCG {
-  template <typename OperatorType, typename VectorViewType>
-  KOKKOS_INLINE_FUNCTION static int invoke(
-      const MemberType& member, const OperatorType& A, const VectorViewType& _B,
-      const VectorViewType& _X,
-      const KrylovHandle<typename VectorViewType::non_const_value_type>&
-          handle) {
-    typedef int OrdinalType;
-    typedef typename Kokkos::Details::ArithTraits<
-        typename VectorViewType::non_const_value_type>::mag_type MagnitudeType;
-
-    const size_t maximum_iteration = handle.get_max_iteration();
-    const MagnitudeType tolerance  = handle.get_tolerance();
-
-    using ScratchPadNormViewType = Kokkos::View<
-        MagnitudeType*,
-        typename VectorViewType::execution_space::scratch_memory_space>;
-    using ScratchPadVectorViewType = Kokkos::View<
-        typename VectorViewType::non_const_value_type**,
-        typename VectorViewType::array_layout,
-        typename VectorViewType::execution_space::scratch_memory_space>;
-    using TeamVectorCopy1D = TeamVectorCopy<MemberType, Trans::NoTranspose, 1>;
-
-    const OrdinalType numMatrices = _X.extent(0);
-    const OrdinalType numRows     = _X.extent(1);
-
-    ScratchPadVectorViewType P(member.team_scratch(0), numMatrices, numRows);
-    ScratchPadVectorViewType Q(member.team_scratch(0), numMatrices, numRows);
-    ScratchPadVectorViewType R(member.team_scratch(0), numMatrices, numRows);
-    ScratchPadVectorViewType X(member.team_scratch(0), numMatrices, numRows);
-
-    ScratchPadNormViewType sqr_norm_0(member.team_scratch(0), numMatrices);
-    ScratchPadNormViewType sqr_norm_j(member.team_scratch(0), numMatrices);
-    ScratchPadNormViewType alpha(member.team_scratch(0), numMatrices);
-    ScratchPadNormViewType mask(member.team_scratch(0), numMatrices);
-    ScratchPadNormViewType tmp(member.team_scratch(0), numMatrices);
-
-    TeamVectorCopy<MemberType>::invoke(member, _X, X);
-    // Deep copy of b into r_0:
-    TeamVectorCopy<MemberType>::invoke(member, _B, R);
-
-    // r_0 := b - A x_0
+template <typename OperatorType, typename VectorViewType,
+          typename KrylovHandleType>
+KOKKOS_INLINE_FUNCTION int TeamVectorCG<MemberType>::invoke(
+    const MemberType& member, const OperatorType& A, const VectorViewType& _B,
+    const VectorViewType& _X, const KrylovHandleType& handle) {
+  typedef int OrdinalType;
+  typedef typename Kokkos::Details::ArithTraits<
+      typename VectorViewType::non_const_value_type>::mag_type MagnitudeType;
+
+  const size_t maximum_iteration = handle.get_max_iteration();
+  const MagnitudeType tolerance  = handle.get_tolerance();
+
+  using ScratchPadNormViewType = Kokkos::View<
+      MagnitudeType*,
+      typename VectorViewType::execution_space::scratch_memory_space>;
+  using ScratchPadVectorViewType = Kokkos::View<
+      typename VectorViewType::non_const_value_type**,
+      typename VectorViewType::array_layout,
+      typename VectorViewType::execution_space::scratch_memory_space>;
+  using TeamVectorCopy1D = TeamVectorCopy<MemberType, Trans::NoTranspose, 1>;
+
+  const OrdinalType numMatrices = _X.extent(0);
+  const OrdinalType numRows     = _X.extent(1);
+
+  ScratchPadVectorViewType P(
+      member.team_scratch(handle.get_scratch_pad_level()), numMatrices,
+      numRows);
+  ScratchPadVectorViewType Q(
+      member.team_scratch(handle.get_scratch_pad_level()), numMatrices,
+      numRows);
+  ScratchPadVectorViewType R(
+      member.team_scratch(handle.get_scratch_pad_level()), numMatrices,
+      numRows);
+  ScratchPadVectorViewType X(
+      member.team_scratch(handle.get_scratch_pad_level()), numMatrices,
+      numRows);
+
+  ScratchPadNormViewType sqr_norm_0(
+      member.team_scratch(handle.get_scratch_pad_level()), numMatrices);
+  ScratchPadNormViewType sqr_norm_j(
+      member.team_scratch(handle.get_scratch_pad_level()), numMatrices);
+  ScratchPadNormViewType alpha(
+      member.team_scratch(handle.get_scratch_pad_level()), numMatrices);
+  ScratchPadNormViewType mask(
+      member.team_scratch(handle.get_scratch_pad_level()), numMatrices);
+  ScratchPadNormViewType tmp(
+      member.team_scratch(handle.get_scratch_pad_level()), numMatrices);
+
+  TeamVectorCopy<MemberType>::invoke(member, _X, X);
+  // Deep copy of b into r_0:
+  TeamVectorCopy<MemberType>::invoke(member, _B, R);
+
+  // r_0 := b - A x_0
+  member.team_barrier();
+  A.template apply<Trans::NoTranspose, Mode::TeamVector>(member, X, R, -1, 1);
+  member.team_barrier();
+
+  // Deep copy of r_0 into p_0:
+  TeamVectorCopy<MemberType>::invoke(member, R, P);
+
+  TeamVectorDot<MemberType>::invoke(member, R, R, sqr_norm_0);
+  member.team_barrier();
+
+  Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices),
+                       [&](const OrdinalType& i) {
+                         mask(i) =
+                             sqr_norm_0(i) > tolerance * tolerance ? 1. : 0;
+                       });
+
+  TeamVectorCopy1D::invoke(member, sqr_norm_0, sqr_norm_j);
+
+  int status               = 1;
+  int number_not_converged = 0;
+
+  for (size_t j = 0; j < maximum_iteration; ++j) {
+    // q := A p_j
+    A.template apply<Trans::NoTranspose, Mode::TeamVector>(member, P, Q);
     member.team_barrier();
-    A.template apply<MemberType, ScratchPadVectorViewType,
-                     ScratchPadVectorViewType, Trans::NoTranspose,
-                     Mode::TeamVector>(member, X, R, -1, 1);
+
+    TeamVectorDot<MemberType>::invoke(member, P, Q, tmp);
+    member.team_barrier();
+
+    Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices),
+                         [&](const OrdinalType& i) {
+                           alpha(i) =
+                               mask(i) != 0. ? sqr_norm_j(i) / tmp(i) : 0.;
+                         });
+    member.team_barrier();
+
+    // x_{j+1} := alpha p_j + x_j
+    TeamVectorAxpy<MemberType>::invoke(member, alpha, P, X);
+    member.team_barrier();
+
+    // r_{j+1} := - alpha q + r_j
+    Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices),
+                         [&](const OrdinalType& i) { alpha(i) = -alpha(i); });
     member.team_barrier();
 
-    // Deep copy of r_0 into p_0:
-    TeamVectorCopy<MemberType>::invoke(member, R, P);
+    TeamVectorAxpy<MemberType>::invoke(member, alpha, Q, R);
+    member.team_barrier();
 
-    TeamVectorDot<MemberType>::invoke(member, R, R, sqr_norm_0);
+    TeamVectorDot<MemberType>::invoke(member, R, R, tmp);
     member.team_barrier();
 
     Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices),
                          [&](const OrdinalType& i) {
-                           mask(i) =
-                               sqr_norm_0(i) > tolerance * tolerance ? 1. : 0;
+                           alpha(i) =
+                               mask(i) != 0. ? tmp(i) / sqr_norm_j(i) : 0.;
                          });
 
-    TeamVectorCopy1D::invoke(member, sqr_norm_0, sqr_norm_j);
-
-    int status               = 1;
-    int number_not_converged = 0;
-
-    for (size_t j = 0; j < maximum_iteration; ++j) {
-      // q := A p_j
-      A.template apply<MemberType, ScratchPadVectorViewType,
-                       ScratchPadVectorViewType, Trans::NoTranspose,
-                       Mode::TeamVector>(member, P, Q);
-      member.team_barrier();
-
-      TeamVectorDot<MemberType>::invoke(member, P, Q, tmp);
-      member.team_barrier();
-
-      Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices),
-                           [&](const OrdinalType& i) {
-                             alpha(i) =
-                                 mask(i) != 0. ? sqr_norm_j(i) / tmp(i) : 0.;
-                           });
-      member.team_barrier();
-
-      // x_{j+1} := alpha p_j + x_j
-      TeamVectorAxpy<MemberType>::invoke(member, alpha, P, X);
-      member.team_barrier();
-
-      // r_{j+1} := - alpha q + r_j
-      Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices),
-                           [&](const OrdinalType& i) { alpha(i) = -alpha(i); });
-      member.team_barrier();
-
-      TeamVectorAxpy<MemberType>::invoke(member, alpha, Q, R);
-      member.team_barrier();
-
-      TeamVectorDot<MemberType>::invoke(member, R, R, tmp);
-      member.team_barrier();
-
-      Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices),
-                           [&](const OrdinalType& i) {
-                             alpha(i) =
-                                 mask(i) != 0. ? tmp(i) / sqr_norm_j(i) : 0.;
-                           });
-
-      TeamVectorCopy1D::invoke(member, tmp, sqr_norm_j);
-
-      // Relative convergence check:
-      number_not_converged = 0;
-      Kokkos::parallel_reduce(
-          Kokkos::TeamVectorRange(member, 0, numMatrices),
-          [&](const OrdinalType& i, int& lnumber_not_converged) {
-            if (sqr_norm_j(i) / sqr_norm_0(i) > tolerance * tolerance)
-              ++lnumber_not_converged;
-            else
-              mask(i) = 0.;
-          },
-          number_not_converged);
-
-      member.team_barrier();
-
-      if (number_not_converged == 0) {
-        status = 0;
-        break;
-      }
-
-      // p_{j+1} := alpha p_j + r_{j+1}
-      TeamVectorXpay<MemberType>::invoke(member, alpha, R, P);
-      member.team_barrier();
+    TeamVectorCopy1D::invoke(member, tmp, sqr_norm_j);
+
+    // Relative convergence check:
+    number_not_converged = 0;
+    Kokkos::parallel_reduce(
+        Kokkos::TeamVectorRange(member, 0, numMatrices),
+        [&](const OrdinalType& i, int& lnumber_not_converged) {
+          if (sqr_norm_j(i) / sqr_norm_0(i) > tolerance * tolerance)
+            ++lnumber_not_converged;
+          else
+            mask(i) = 0.;
+        },
+        number_not_converged);
+
+    member.team_barrier();
+
+    if (number_not_converged == 0) {
+      status = 0;
+      break;
     }
 
-    TeamVectorCopy<MemberType>::invoke(member, X, _X);
-    return status;
+    // p_{j+1} := alpha p_j + r_{j+1}
+    TeamVectorXpay<MemberType>::invoke(member, alpha, R, P);
+    member.team_barrier();
   }
-};
+
+  TeamVectorCopy<MemberType>::invoke(member, X, _X);
+  return status;
+}
 }  // namespace KokkosBatched
 
 #endif
diff --git a/src/batched/sparse/impl/KokkosBatched_CG_Team_Impl.hpp b/src/batched/sparse/impl/KokkosBatched_CG_Team_Impl.hpp
index 2bc611aa32..cd7a478548 100644
--- a/src/batched/sparse/impl/KokkosBatched_CG_Team_Impl.hpp
+++ b/src/batched/sparse/impl/KokkosBatched_CG_Team_Impl.hpp
@@ -60,139 +60,145 @@ namespace KokkosBatched {
 ///
 
 template <typename MemberType>
-struct TeamCG {
-  template <typename OperatorType, typename VectorViewType>
-  KOKKOS_INLINE_FUNCTION static int invoke(
-      const MemberType& member, const OperatorType& A, const VectorViewType& _B,
-      const VectorViewType& _X,
-      const KrylovHandle<typename VectorViewType::non_const_value_type>&
-          handle) {
-    typedef int OrdinalType;
-    typedef typename Kokkos::Details::ArithTraits<
-        typename VectorViewType::non_const_value_type>::mag_type MagnitudeType;
-
-    size_t maximum_iteration      = handle.get_max_iteration();
-    const MagnitudeType tolerance = handle.get_tolerance();
-
-    using ScratchPadNormViewType = Kokkos::View<
-        MagnitudeType*,
-        typename VectorViewType::execution_space::scratch_memory_space>;
-    using ScratchPadVectorViewType = Kokkos::View<
-        typename VectorViewType::non_const_value_type**,
-        typename VectorViewType::array_layout,
-        typename VectorViewType::execution_space::scratch_memory_space>;
-    using TeamCopy1D = TeamCopy<MemberType, Trans::NoTranspose, 1>;
-
-    const OrdinalType numMatrices = _X.extent(0);
-    const OrdinalType numRows     = _X.extent(1);
-
-    ScratchPadVectorViewType P(member.team_scratch(0), numMatrices, numRows);
-    ScratchPadVectorViewType Q(member.team_scratch(0), numMatrices, numRows);
-    ScratchPadVectorViewType R(member.team_scratch(0), numMatrices, numRows);
-    ScratchPadVectorViewType X(member.team_scratch(0), numMatrices, numRows);
-
-    ScratchPadNormViewType sqr_norm_0(member.team_scratch(0), numMatrices);
-    ScratchPadNormViewType sqr_norm_j(member.team_scratch(0), numMatrices);
-    ScratchPadNormViewType alpha(member.team_scratch(0), numMatrices);
-    ScratchPadNormViewType mask(member.team_scratch(0), numMatrices);
-    ScratchPadNormViewType tmp(member.team_scratch(0), numMatrices);
-
-    TeamCopy<MemberType>::invoke(member, _X, X);
-    // Deep copy of b into r_0:
-    TeamCopy<MemberType>::invoke(member, _B, R);
-
-    // r_0 := b - A x_0
+template <typename OperatorType, typename VectorViewType, typename KrylovHandle>
+KOKKOS_INLINE_FUNCTION int TeamCG<MemberType>::invoke(
+    const MemberType& member, const OperatorType& A, const VectorViewType& _B,
+    const VectorViewType& _X, const KrylovHandle& handle) {
+  typedef int OrdinalType;
+  typedef typename Kokkos::Details::ArithTraits<
+      typename VectorViewType::non_const_value_type>::mag_type MagnitudeType;
+
+  size_t maximum_iteration      = handle.get_max_iteration();
+  const MagnitudeType tolerance = handle.get_tolerance();
+
+  using ScratchPadNormViewType = Kokkos::View<
+      MagnitudeType*,
+      typename VectorViewType::execution_space::scratch_memory_space>;
+  using ScratchPadVectorViewType = Kokkos::View<
+      typename VectorViewType::non_const_value_type**,
+      typename VectorViewType::array_layout,
+      typename VectorViewType::execution_space::scratch_memory_space>;
+  using TeamCopy1D = TeamCopy<MemberType, Trans::NoTranspose, 1>;
+
+  const OrdinalType numMatrices = _X.extent(0);
+  const OrdinalType numRows     = _X.extent(1);
+
+  ScratchPadVectorViewType P(
+      member.team_scratch(handle.get_scratch_pad_level()), numMatrices,
+      numRows);
+  ScratchPadVectorViewType Q(
+      member.team_scratch(handle.get_scratch_pad_level()), numMatrices,
+      numRows);
+  ScratchPadVectorViewType R(
+      member.team_scratch(handle.get_scratch_pad_level()), numMatrices,
+      numRows);
+  ScratchPadVectorViewType X(
+      member.team_scratch(handle.get_scratch_pad_level()), numMatrices,
+      numRows);
+
+  ScratchPadNormViewType sqr_norm_0(
+      member.team_scratch(handle.get_scratch_pad_level()), numMatrices);
+  ScratchPadNormViewType sqr_norm_j(
+      member.team_scratch(handle.get_scratch_pad_level()), numMatrices);
+  ScratchPadNormViewType alpha(
+      member.team_scratch(handle.get_scratch_pad_level()), numMatrices);
+  ScratchPadNormViewType mask(
+      member.team_scratch(handle.get_scratch_pad_level()), numMatrices);
+  ScratchPadNormViewType tmp(
+      member.team_scratch(handle.get_scratch_pad_level()), numMatrices);
+
+  TeamCopy<MemberType>::invoke(member, _X, X);
+  // Deep copy of b into r_0:
+  TeamCopy<MemberType>::invoke(member, _B, R);
+
+  // r_0 := b - A x_0
+  member.team_barrier();
+  A.template apply<Trans::NoTranspose, Mode::Team>(member, X, R, -1, 1);
+  member.team_barrier();
+
+  // Deep copy of r_0 into p_0:
+  TeamCopy<MemberType>::invoke(member, R, P);
+
+  TeamDot<MemberType>::invoke(member, R, R, sqr_norm_0);
+  member.team_barrier();
+
+  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices),
+                       [&](const OrdinalType& i) {
+                         mask(i) =
+                             sqr_norm_0(i) > tolerance * tolerance ? 1. : 0;
+                       });
+
+  TeamCopy1D::invoke(member, sqr_norm_0, sqr_norm_j);
+
+  int status               = 1;
+  int number_not_converged = 0;
+
+  for (size_t j = 0; j < maximum_iteration; ++j) {
+    // q := A p_j
+    A.template apply<Trans::NoTranspose, Mode::Team>(member, P, Q);
     member.team_barrier();
-    A.template apply<MemberType, ScratchPadVectorViewType,
-                     ScratchPadVectorViewType, Trans::NoTranspose, Mode::Team>(
-        member, X, R, -1, 1);
+
+    TeamDot<MemberType>::invoke(member, P, Q, tmp);
     member.team_barrier();
 
-    // Deep copy of r_0 into p_0:
-    TeamCopy<MemberType>::invoke(member, R, P);
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices),
+                         [&](const OrdinalType& i) {
+                           alpha(i) =
+                               mask(i) != 0. ? sqr_norm_j(i) / tmp(i) : 0.;
+                         });
+    member.team_barrier();
 
-    TeamDot<MemberType>::invoke(member, R, R, sqr_norm_0);
+    // x_{j+1} := alpha p_j + x_j
+    TeamAxpy<MemberType>::invoke(member, alpha, P, X);
+    member.team_barrier();
+
+    // r_{j+1} := - alpha q + r_j
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices),
+                         [&](const OrdinalType& i) { alpha(i) = -alpha(i); });
+    member.team_barrier();
+
+    TeamAxpy<MemberType>::invoke(member, alpha, Q, R);
+    member.team_barrier();
+
+    TeamDot<MemberType>::invoke(member, R, R, tmp);
     member.team_barrier();
 
     Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices),
                          [&](const OrdinalType& i) {
-                           mask(i) =
-                               sqr_norm_0(i) > tolerance * tolerance ? 1. : 0;
+                           alpha(i) =
+                               mask(i) != 0. ? tmp(i) / sqr_norm_j(i) : 0.;
                          });
 
-    TeamCopy1D::invoke(member, sqr_norm_0, sqr_norm_j);
-
-    int status               = 1;
-    int number_not_converged = 0;
-
-    for (size_t j = 0; j < maximum_iteration; ++j) {
-      // q := A p_j
-      A.template apply<MemberType, ScratchPadVectorViewType,
-                       ScratchPadVectorViewType, Trans::NoTranspose,
-                       Mode::Team>(member, P, Q);
-      member.team_barrier();
-
-      TeamDot<MemberType>::invoke(member, P, Q, tmp);
-      member.team_barrier();
-
-      Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices),
-                           [&](const OrdinalType& i) {
-                             alpha(i) =
-                                 mask(i) != 0. ? sqr_norm_j(i) / tmp(i) : 0.;
-                           });
-      member.team_barrier();
-
-      // x_{j+1} := alpha p_j + x_j
-      TeamAxpy<MemberType>::invoke(member, alpha, P, X);
-      member.team_barrier();
-
-      // r_{j+1} := - alpha q + r_j
-      Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices),
-                           [&](const OrdinalType& i) { alpha(i) = -alpha(i); });
-      member.team_barrier();
-
-      TeamAxpy<MemberType>::invoke(member, alpha, Q, R);
-      member.team_barrier();
-
-      TeamDot<MemberType>::invoke(member, R, R, tmp);
-      member.team_barrier();
-
-      Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices),
-                           [&](const OrdinalType& i) {
-                             alpha(i) =
-                                 mask(i) != 0. ? tmp(i) / sqr_norm_j(i) : 0.;
-                           });
-
-      TeamCopy1D::invoke(member, tmp, sqr_norm_j);
-
-      // Relative convergence check:
-      number_not_converged = 0;
-      Kokkos::parallel_reduce(
-          Kokkos::TeamThreadRange(member, 0, numMatrices),
-          [&](const OrdinalType& i, int& lnumber_not_converged) {
-            if (sqr_norm_j(i) / sqr_norm_0(i) > tolerance * tolerance)
-              ++lnumber_not_converged;
-            else
-              mask(i) = 0.;
-          },
-          number_not_converged);
-
-      member.team_barrier();
-
-      if (number_not_converged == 0) {
-        status = 0;
-        break;
-      }
-
-      // p_{j+1} := alpha p_j + r_{j+1}
-      TeamXpay<MemberType>::invoke(member, alpha, R, P);
-      member.team_barrier();
+    TeamCopy1D::invoke(member, tmp, sqr_norm_j);
+
+    // Relative convergence check:
+    number_not_converged = 0;
+    Kokkos::parallel_reduce(
+        Kokkos::TeamThreadRange(member, 0, numMatrices),
+        [&](const OrdinalType& i, int& lnumber_not_converged) {
+          if (sqr_norm_j(i) / sqr_norm_0(i) > tolerance * tolerance)
+            ++lnumber_not_converged;
+          else
+            mask(i) = 0.;
+        },
+        number_not_converged);
+
+    member.team_barrier();
+
+    if (number_not_converged == 0) {
+      status = 0;
+      break;
     }
 
-    TeamCopy<MemberType>::invoke(member, X, _X);
-    return status;
+    // p_{j+1} := alpha p_j + r_{j+1}
+    TeamXpay<MemberType>::invoke(member, alpha, R, P);
+    member.team_barrier();
   }
-};
+
+  TeamCopy<MemberType>::invoke(member, X, _X);
+  return status;
+}
+
 }  // namespace KokkosBatched
 
 #endif
diff --git a/src/batched/sparse/impl/KokkosBatched_GMRES_Serial_Impl.hpp b/src/batched/sparse/impl/KokkosBatched_GMRES_Serial_Impl.hpp
new file mode 100644
index 0000000000..5e4d0aba9b
--- /dev/null
+++ b/src/batched/sparse/impl/KokkosBatched_GMRES_Serial_Impl.hpp
@@ -0,0 +1,328 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.4
+//       Copyright (2021) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+#ifndef __KOKKOSBATCHED_GMRES_SERIAL_IMPL_HPP__
+#define __KOKKOSBATCHED_GMRES_SERIAL_IMPL_HPP__
+
+/// \author Kim Liegeois (knliege@sandia.gov)
+
+#include "KokkosBatched_Util.hpp"
+
+#include "KokkosBatched_Axpy.hpp"
+#include "KokkosBatched_Copy_Decl.hpp"
+#include "KokkosBatched_Dot.hpp"
+#include "KokkosBatched_Spmv.hpp"
+#include "KokkosBatched_Xpay.hpp"
+#include "KokkosBatched_Givens_Serial_Internal.hpp"
+#include "KokkosBatched_Trsm_Decl.hpp"
+#include "KokkosBatched_Identity.hpp"
+#include "KokkosBatched_Gemv_Decl.hpp"
+
+namespace KokkosBatched {
+
+///
+/// Serial GMRES
+///
+
+template <typename OperatorType, typename VectorViewType,
+          typename PrecOperatorType, typename KrylovHandleType>
+KOKKOS_INLINE_FUNCTION int SerialGMRES::invoke(const OperatorType& A,
+                                               const VectorViewType& _B,
+                                               const VectorViewType& _X,
+                                               const PrecOperatorType& P,
+                                               const KrylovHandleType& handle,
+                                               const int GMRES_id) {
+  typedef int OrdinalType;
+  typedef typename Kokkos::Details::ArithTraits<
+      typename VectorViewType::non_const_value_type>::mag_type MagnitudeType;
+  typedef Kokkos::Details::ArithTraits<MagnitudeType> ATM;
+
+  using SerialCopy1D = SerialCopy<Trans::NoTranspose, 1>;
+  using SerialCopy2D = SerialCopy<Trans::NoTranspose, 2>;
+
+  const OrdinalType numMatrices = _X.extent(0);
+  const OrdinalType numRows     = _X.extent(1);
+
+  size_t maximum_iteration = handle.get_max_iteration() < numRows
+                                 ? handle.get_max_iteration()
+                                 : numRows;
+  const MagnitudeType tolerance     = handle.get_tolerance();
+  const MagnitudeType max_tolerance = handle.get_max_tolerance();
+
+  int n_V      = numRows;
+  int n_H      = maximum_iteration + 1;
+  int n_Givens = 2;
+
+  int offset_V      = 0;
+  int offset_H      = offset_V + n_V;
+  int offset_Givens = offset_H + n_H;
+
+  const int first_matrix = handle.first_index(GMRES_id);
+  const int last_matrix  = handle.last_index(GMRES_id);
+
+  auto V_view = Kokkos::subview(
+      handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix),
+      Kokkos::ALL, Kokkos::make_pair(offset_V, offset_V + n_V));
+  auto H_view = Kokkos::subview(
+      handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix),
+      Kokkos::ALL, Kokkos::make_pair(offset_H, offset_H + n_H));
+  auto Givens_view = Kokkos::subview(
+      handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix),
+      Kokkos::ALL, Kokkos::make_pair(offset_Givens, offset_Givens + n_Givens));
+
+  int n_G    = maximum_iteration + 1;
+  int n_W    = numRows;
+  int n_mask = 1;
+
+  int offset_G    = 0;
+  int offset_W    = offset_G + n_G;
+  int offset_mask = offset_W + n_W;
+  int offset_tmp  = offset_mask + n_mask;
+
+  auto G    = Kokkos::subview(handle.tmp_view,
+                           Kokkos::make_pair(first_matrix, last_matrix),
+                           Kokkos::make_pair(offset_G, offset_G + n_G));
+  auto W    = Kokkos::subview(handle.tmp_view,
+                           Kokkos::make_pair(first_matrix, last_matrix),
+                           Kokkos::make_pair(offset_W, offset_W + n_W));
+  auto mask = Kokkos::subview(handle.tmp_view,
+                              Kokkos::make_pair(first_matrix, last_matrix),
+                              offset_mask);
+  auto tmp =
+      Kokkos::subview(handle.tmp_view,
+                      Kokkos::make_pair(first_matrix, last_matrix), offset_tmp);
+
+  // Deep copy of b into r_0:
+  SerialCopy2D::invoke(_B, W);
+
+  // r_0 := b - A x_0
+  A.template apply<Trans::NoTranspose>(_X, W, -1, 1);
+
+  P.template apply<Trans::NoTranspose, 1>(W, W);
+
+  SerialDot<Trans::NoTranspose>::invoke(W, W, tmp);
+
+  for (OrdinalType i = 0; i < numMatrices; ++i) {
+    tmp(i) = ATM::sqrt(tmp(i));
+    handle.set_norm(GMRES_id, i, 0, tmp(i));
+    if (tmp(i) > max_tolerance) {
+      mask(i) = 1;
+      G(i, 0) = tmp(i);
+      tmp(i)  = 1. / tmp(i);
+    } else {
+      handle.set_iteration(GMRES_id, i, 0);
+      mask(i) = 0;
+      G(i, 0) = 0.;
+      tmp(i)  = 0.;
+    }
+  }
+
+  auto V_0 = Kokkos::subview(V_view, Kokkos::ALL, 0, Kokkos::ALL);
+  for (OrdinalType iRow = 0; iRow < numRows; ++iRow) {
+    for (OrdinalType iMatrix = 0; iMatrix < numMatrices; ++iMatrix) {
+      V_0(iMatrix, iRow) = W(iMatrix, iRow) * tmp(iMatrix);
+    }
+  }
+  int status = 1;
+  // int number_not_converged = 0;
+
+  for (size_t j = 0; j < maximum_iteration; ++j) {
+    // q := A p_j
+    auto V_j = Kokkos::subview(V_view, Kokkos::ALL, j, Kokkos::ALL);
+
+    A.template apply<Trans::NoTranspose>(V_j, W);
+
+    P.template apply<Trans::NoTranspose, 1>(W, W);
+
+    if (handle.get_ortho_strategy() == 0) {
+      for (OrdinalType l = 0; l < numMatrices; ++l) {
+        auto W_l   = Kokkos::subview(W, l, Kokkos::ALL);
+        auto V_old = Kokkos::subview(
+            V_view, l, Kokkos::make_pair(0, (int)j + 1), Kokkos::ALL);
+        auto H_old =
+            Kokkos::subview(H_view, l, j, Kokkos::make_pair(0, (int)j + 1));
+
+        // Inner products
+        SerialGemv<Trans::NoTranspose, Algo::Gemv::Unblocked>::invoke(
+            1, V_old, W_l, 0, H_old);
+
+        // Update
+        SerialGemv<Trans::Transpose, Algo::Gemv::Unblocked>::invoke(
+            -1, V_old, H_old, 1, W_l);
+      }
+    }
+    if (handle.get_ortho_strategy() == 1) {
+      for (size_t i = 0; i < j + 1; ++i) {
+        auto V_i = Kokkos::subview(V_view, Kokkos::ALL, i, Kokkos::ALL);
+        SerialDot<Trans::NoTranspose>::invoke(W, V_i, tmp);
+        SerialCopy1D::invoke(tmp, Kokkos::subview(H_view, Kokkos::ALL, j, i));
+        for (OrdinalType ii = 0; ii < numMatrices; ++ii) tmp(ii) = -tmp(ii);
+
+        SerialAxpy::invoke(tmp, V_i, W);
+      }
+    }
+
+    SerialDot<Trans::NoTranspose>::invoke(W, W, tmp);
+
+    for (OrdinalType i = 0; i < numMatrices; ++i) {
+      H_view(i, j, j + 1) = ATM::sqrt(tmp(i));
+      tmp(i) =
+          H_view(i, j, j + 1) > max_tolerance ? 1. / H_view(i, j, j + 1) : 0.;
+    }
+
+    if (j + 1 < maximum_iteration) {
+      auto V_n = Kokkos::subview(V_view, Kokkos::ALL, j + 1, Kokkos::ALL);
+      for (OrdinalType iRow = 0; iRow < numRows; ++iRow) {
+        for (OrdinalType iMatrix = 0; iMatrix < numMatrices; ++iMatrix) {
+          V_n(iMatrix, iRow) = W(iMatrix, iRow) * tmp(iMatrix);
+        }
+      }
+    }
+
+    for (OrdinalType l = 0; l < numMatrices; ++l) {
+      // Apply the previous Givens rotations:
+      auto H_j        = Kokkos::subview(H_view, l, j, Kokkos::ALL);
+      auto Givens_0_l = Kokkos::subview(Givens_view, l, Kokkos::ALL, 0);
+      auto Givens_1_l = Kokkos::subview(Givens_view, l, Kokkos::ALL, 1);
+
+      if (mask(l) == 1.) {
+        for (size_t i = 0; i < j; ++i) {
+          auto tmp1  = Givens_0_l(i) * H_j(i) + Givens_1_l(i) * H_j(i + 1);
+          auto tmp2  = -Givens_1_l(i) * H_j(i) + Givens_0_l(i) * H_j(i + 1);
+          H_j(i)     = tmp1;
+          H_j(i + 1) = tmp2;
+        }
+
+        // Compute the new Givens rotation:
+        Kokkos::pair<typename VectorViewType::non_const_value_type,
+                     typename VectorViewType::non_const_value_type>
+            G_new(1, 0);
+        typename VectorViewType::non_const_value_type alpha = 0;
+        SerialGivensInternal::invoke(H_j(j), H_j(j + 1), &G_new, &alpha);
+
+        Givens_0_l(j) = G_new.first;
+        Givens_1_l(j) = G_new.second;
+
+        // Apply the new Givens rotation:
+        auto tmp1  = Givens_0_l(j) * H_j(j) + Givens_1_l(j) * H_j(j + 1);
+        auto tmp2  = -Givens_1_l(j) * H_j(j) + Givens_0_l(j) * H_j(j + 1);
+        H_j(j)     = tmp1;
+        H_j(j + 1) = tmp2;
+
+        G(l, j + 1) = -Givens_1_l(j) * G(l, j);
+        G(l, j) *= Givens_0_l(j);
+      } else {
+        H_j(j)      = 1.;
+        G(l, j + 1) = 0.;
+      }
+
+      auto res_norm = Kokkos::ArithTraits<double>::abs(G(l, j + 1)) / G(l, 0);
+
+      handle.set_norm(GMRES_id, l, j + 1, res_norm);
+
+      if (mask(l) == 1. && res_norm < tolerance) {
+        mask(l)     = 0.;
+        G(l, j + 1) = 0.;
+        handle.set_iteration(GMRES_id, l, j + 1);
+      }
+    }
+
+    bool all_converged = true;
+    for (OrdinalType l = 0; l < numMatrices; ++l)
+      all_converged = (all_converged && mask(l) == 0.);
+    if (all_converged) {
+      maximum_iteration = j + 1;
+      break;
+    }
+  }
+
+  auto first_indices = Kokkos::make_pair(0, (int)maximum_iteration);
+
+  for (OrdinalType l = 0; l < numMatrices; ++l) {
+    auto A_l = Kokkos::subview(H_view, l, first_indices, first_indices);
+    auto B_l = Kokkos::subview(G, l, first_indices);
+
+    SerialTrsm<Side::Left, Uplo::Lower, Trans::Transpose, Diag::NonUnit,
+               Algo::Trsm::Unblocked>::invoke(1, A_l, B_l);
+  }
+
+  if (handle.get_ortho_strategy() == 0) {
+    for (OrdinalType l = 0; l < numMatrices; ++l) {
+      SerialGemv<Trans::Transpose, Algo::Gemv::Unblocked>::invoke(
+          1, Kokkos::subview(V_view, l, first_indices, Kokkos::ALL),
+          Kokkos::subview(G, l, first_indices), 1,
+          Kokkos::subview(_X, l, Kokkos::ALL));
+    }
+  }
+  if (handle.get_ortho_strategy() == 1) {
+    for (size_t j = 0; j < maximum_iteration; ++j) {
+      SerialAxpy::invoke(Kokkos::subview(G, Kokkos::ALL, j),
+                         Kokkos::subview(V_view, Kokkos::ALL, j, Kokkos::ALL),
+                         _X);
+    }
+  }
+
+  if (handle.get_compute_last_residual()) {
+    SerialCopy2D::invoke(_B, W);
+    A.template apply<Trans::NoTranspose>(_X, W, -1, 1);
+    P.template apply<Trans::NoTranspose, 1>(W, W);
+    SerialDot<Trans::NoTranspose>::invoke(W, W, tmp);
+
+    for (OrdinalType i = 0; i < numMatrices; ++i) {
+      tmp(i) = ATM::sqrt(tmp(i));
+      handle.set_last_norm(GMRES_id, i, tmp(i));
+    }
+  }
+  return status;
+}
+
+template <typename OperatorType, typename VectorViewType,
+          typename KrylovHandleType>
+KOKKOS_INLINE_FUNCTION int SerialGMRES::invoke(const OperatorType& A,
+                                               const VectorViewType& _B,
+                                               const VectorViewType& _X,
+                                               const KrylovHandleType& handle) {
+  Identity P;
+  return invoke<OperatorType, VectorViewType, Identity>(A, _B, _X, P, handle);
+}
+}  // namespace KokkosBatched
+
+#endif
diff --git a/src/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp b/src/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp
index 8e45b97556..7fdf244fa7 100644
--- a/src/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp
+++ b/src/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp
@@ -54,6 +54,7 @@
 #include "KokkosBatched_Givens_Serial_Internal.hpp"
 #include "KokkosBatched_Trsm_Decl.hpp"
 #include "KokkosBatched_Identity.hpp"
+#include "KokkosBatched_Gemv_Decl.hpp"
 
 namespace KokkosBatched {
 
@@ -64,125 +65,159 @@ namespace KokkosBatched {
 ///
 
 template <typename MemberType>
-struct TeamVectorGMRES {
-  template <typename OperatorType, typename VectorViewType,
-            typename PrecOperatorType>
-  KOKKOS_INLINE_FUNCTION static int invoke(
-      const MemberType& member, const OperatorType& A, const VectorViewType& _B,
-      const VectorViewType& _X, const PrecOperatorType& P,
-      const KrylovHandle<typename VectorViewType::non_const_value_type>&
-          handle) {
-    typedef int OrdinalType;
-    typedef typename Kokkos::Details::ArithTraits<
-        typename VectorViewType::non_const_value_type>::mag_type MagnitudeType;
-    typedef Kokkos::Details::ArithTraits<MagnitudeType> ATM;
-
-    using ScratchPadNormViewType = Kokkos::View<
-        MagnitudeType*,
-        typename VectorViewType::execution_space::scratch_memory_space>;
-    using ScratchPadVectorViewType = Kokkos::View<
-        typename VectorViewType::non_const_value_type**,
-        typename VectorViewType::array_layout,
-        typename VectorViewType::execution_space::scratch_memory_space>;
-    using ScratchPadMultiVectorViewType = Kokkos::View<
-        typename VectorViewType::non_const_value_type***,
-        typename VectorViewType::array_layout,
-        typename VectorViewType::execution_space::scratch_memory_space>;
-    using TeamVectorCopy1D = TeamVectorCopy<MemberType, Trans::NoTranspose, 1>;
-
-    const OrdinalType numMatrices = _X.extent(0);
-    const OrdinalType numRows     = _X.extent(1);
-
-    size_t maximum_iteration = handle.get_max_iteration() < numRows
-                                   ? handle.get_max_iteration()
-                                   : numRows;
-    const MagnitudeType tolerance     = handle.get_tolerance();
-    const MagnitudeType max_tolerance = 0.;
-
-    ScratchPadMultiVectorViewType V(member.team_scratch(1), numMatrices,
-                                    maximum_iteration + 1, numRows);
-    ScratchPadMultiVectorViewType H(member.team_scratch(1), numMatrices,
-                                    maximum_iteration + 1, maximum_iteration);
-    ScratchPadMultiVectorViewType Givens(member.team_scratch(1), numMatrices,
-                                         maximum_iteration, 2);
-    ScratchPadVectorViewType G(member.team_scratch(1), numMatrices,
-                               maximum_iteration + 1);
-
-    ScratchPadVectorViewType W(member.team_scratch(0), numMatrices, numRows);
-    ScratchPadVectorViewType Q(member.team_scratch(0), numMatrices, numRows);
-    ScratchPadVectorViewType R(member.team_scratch(0), numMatrices, numRows);
-    ScratchPadVectorViewType X(member.team_scratch(0), numMatrices, numRows);
-
-    ScratchPadNormViewType beta(member.team_scratch(0), numMatrices);
-    ScratchPadNormViewType mask(member.team_scratch(0), numMatrices);
-    ScratchPadNormViewType tmp(member.team_scratch(0), numMatrices);
-
-    TeamVectorCopy<MemberType>::invoke(member, _X, X);
-    // Deep copy of b into r_0:
-    TeamVectorCopy<MemberType>::invoke(member, _B, R);
-
-    Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices),
-                         [&](const OrdinalType& i) { mask(i) = 1.; });
-
-    // r_0 := b - A x_0
-    member.team_barrier();
-    A.template apply<MemberType, ScratchPadVectorViewType,
-                     ScratchPadVectorViewType, Trans::NoTranspose,
-                     Mode::TeamVector>(member, X, R, -1, 1);
+template <typename OperatorType, typename VectorViewType,
+          typename PrecOperatorType, typename KrylovHandleType>
+KOKKOS_INLINE_FUNCTION int TeamVectorGMRES<MemberType>::invoke(
+    const MemberType& member, const OperatorType& A, const VectorViewType& _B,
+    const VectorViewType& _X, const PrecOperatorType& P,
+    const KrylovHandleType& handle) {
+  typedef int OrdinalType;
+  typedef typename Kokkos::Details::ArithTraits<
+      typename VectorViewType::non_const_value_type>::mag_type MagnitudeType;
+  typedef Kokkos::Details::ArithTraits<MagnitudeType> ATM;
+
+  using ScratchPadVectorViewType = Kokkos::View<
+      typename VectorViewType::non_const_value_type**,
+      typename VectorViewType::array_layout,
+      typename VectorViewType::execution_space::scratch_memory_space>;
+  using TeamVectorCopy1D = TeamVectorCopy<MemberType, Trans::NoTranspose, 1>;
+
+  const OrdinalType numMatrices = _X.extent(0);
+  const OrdinalType numRows     = _X.extent(1);
+
+  size_t maximum_iteration = handle.get_max_iteration() < numRows
+                                 ? handle.get_max_iteration()
+                                 : numRows;
+  const MagnitudeType tolerance     = handle.get_tolerance();
+  const MagnitudeType max_tolerance = handle.get_max_tolerance();
+
+  int n_V      = numRows;
+  int n_H      = maximum_iteration + 1;
+  int n_Givens = 2;
+
+  int offset_V      = 0;
+  int offset_H      = offset_V + n_V;
+  int offset_Givens = offset_H + n_H;
+
+  const int first_matrix = handle.first_index(member.league_rank());
+  const int last_matrix  = handle.last_index(member.league_rank());
+
+  auto V_view = Kokkos::subview(
+      handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix),
+      Kokkos::ALL, Kokkos::make_pair(offset_V, offset_V + n_V));
+  auto H_view = Kokkos::subview(
+      handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix),
+      Kokkos::ALL, Kokkos::make_pair(offset_H, offset_H + n_H));
+  auto Givens_view = Kokkos::subview(
+      handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix),
+      Kokkos::ALL, Kokkos::make_pair(offset_Givens, offset_Givens + n_Givens));
+
+  int n_G    = maximum_iteration + 1;
+  int n_W    = numRows;
+  int n_X    = numRows;
+  int n_mask = 1;
+  int n_tmp  = 1;
+
+  int offset_G    = 0;
+  int offset_W    = offset_G + n_G;
+  int offset_X    = offset_W + n_W;
+  int offset_mask = offset_X + n_X;
+  int offset_tmp  = offset_mask + n_mask;
+
+  ScratchPadVectorViewType tmp_2D(
+      member.team_scratch(handle.get_scratch_pad_level()), numMatrices,
+      n_G + n_W + n_X + n_mask + n_tmp);
+
+  auto G    = Kokkos::subview(tmp_2D, Kokkos::ALL,
+                           Kokkos::make_pair(offset_G, offset_G + n_G));
+  auto W    = Kokkos::subview(tmp_2D, Kokkos::ALL,
+                           Kokkos::make_pair(offset_W, offset_W + n_W));
+  auto X    = Kokkos::subview(tmp_2D, Kokkos::ALL,
+                           Kokkos::make_pair(offset_X, offset_X + n_X));
+  auto mask = Kokkos::subview(tmp_2D, Kokkos::ALL, offset_mask);
+  auto tmp  = Kokkos::subview(tmp_2D, Kokkos::ALL, offset_tmp);
+
+  TeamVectorCopy<MemberType>::invoke(member, _X, X);
+  // Deep copy of b into r_0:
+  TeamVectorCopy<MemberType>::invoke(member, _B, W);
+
+  // r_0 := b - A x_0
+  member.team_barrier();
+  A.template apply<Trans::NoTranspose, Mode::TeamVector>(member, X, W, -1, 1);
+  member.team_barrier();
+
+  P.template apply<Trans::NoTranspose, Mode::TeamVector, 1>(member, W, W);
+  member.team_barrier();
+
+  TeamVectorDot<MemberType>::invoke(member, W, W, tmp);
+  member.team_barrier();
+
+  Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices),
+                       [&](const OrdinalType& i) {
+                         tmp(i) = ATM::sqrt(tmp(i));
+                         handle.set_norm(member.league_rank(), i, 0, tmp(i));
+                         if (tmp(i) > max_tolerance) {
+                           mask(i) = 1;
+                           G(i, 0) = tmp(i);
+                           tmp(i)  = 1. / tmp(i);
+                         } else {
+                           handle.set_iteration(member.league_rank(), i, 0);
+                           mask(i) = 0;
+                           G(i, 0) = 0.;
+                           tmp(i)  = 0.;
+                         }
+                       });
+
+  member.team_barrier();  // Finish writing to tmp
+
+  auto V_0 = Kokkos::subview(V_view, Kokkos::ALL, 0, Kokkos::ALL);
+  Kokkos::parallel_for(
+      Kokkos::TeamVectorRange(member, 0, numMatrices * numRows),
+      [&](const OrdinalType& iTemp) {
+        OrdinalType iRow, iMatrix;
+        getIndices<OrdinalType, typename VectorViewType::array_layout>(
+            iTemp, numRows, numMatrices, iRow, iMatrix);
+        V_0(iMatrix, iRow) = W(iMatrix, iRow) * tmp(iMatrix);
+      });
+  int status = 1;
+  // int number_not_converged = 0;
+
+  for (size_t j = 0; j < maximum_iteration; ++j) {
+    member.team_barrier();  // Finish writing to V
+    // q := A p_j
+    auto V_j = Kokkos::subview(V_view, Kokkos::ALL, j, Kokkos::ALL);
+
+    A.template apply<Trans::NoTranspose, Mode::TeamVector>(member, V_j, W);
     member.team_barrier();
 
-    P.template apply<MemberType, ScratchPadVectorViewType,
-                     ScratchPadVectorViewType, Trans::NoTranspose,
-                     Mode::TeamVector, 1>(member, R, R);
+    P.template apply<Trans::NoTranspose, Mode::TeamVector, 1>(member, W, W);
     member.team_barrier();
 
-    TeamVectorDot<MemberType>::invoke(member, R, R, beta);
-    member.team_barrier();
-
-    Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices),
-                         [&](const OrdinalType& i) {
-                           beta(i) = ATM::sqrt(beta(i));
-                           G(i, 0) = beta(i) > max_tolerance ? beta(i) : 0.;
-                           tmp(i) = beta(i) > max_tolerance ? 1. / beta(i) : 0.;
-                         });
-
-    member.team_barrier();  // Finish writing to tmp
-
-    Kokkos::parallel_for(
-        Kokkos::TeamVectorRange(member, 0, numMatrices * numRows),
-        [&](const OrdinalType& iTemp) {
-          OrdinalType iRow, iMatrix;
-          getIndices<OrdinalType, typename VectorViewType::array_layout>(
-              iTemp, numRows, numMatrices, iRow, iMatrix);
-          V(iMatrix, 0, iRow) = R(iMatrix, iRow) * tmp(iMatrix);
-        });
-
-    int status = 1;
-    // int number_not_converged = 0;
-
-    for (size_t j = 0; j < maximum_iteration; ++j) {
-      member.team_barrier();  // Finish writing to V
-      // q := A p_j
-      auto V_j = Kokkos::subview(V, Kokkos::ALL, j, Kokkos::ALL);
-
-      A.template apply<MemberType, ScratchPadVectorViewType,
-                       ScratchPadVectorViewType, Trans::NoTranspose,
-                       Mode::TeamVector>(member, V_j, W);
+    if (handle.get_ortho_strategy() == 0) {
+      auto V_old = Kokkos::subview(
+          V_view, Kokkos::ALL, Kokkos::make_pair(0, (int)j + 1), Kokkos::ALL);
+      auto H_old = Kokkos::subview(H_view, Kokkos::ALL, j,
+                                   Kokkos::make_pair(0, (int)j + 1));
+      // Inner products
+      TeamVectorGemv<MemberType, Trans::NoTranspose,
+                     Algo::Gemv::Unblocked>::invoke(member, 1, V_old, W, 0,
+                                                    H_old);
       member.team_barrier();
-      P.template apply<MemberType, ScratchPadVectorViewType,
-                       ScratchPadVectorViewType, Trans::NoTranspose,
-                       Mode::TeamVector, 1>(member, W, W);
 
+      // Update
+      TeamVectorGemv<MemberType, Trans::Transpose,
+                     Algo::Gemv::Unblocked>::invoke(member, -1, V_old, H_old, 1,
+                                                    W);
+      member.team_barrier();  // Finish writing to W
+    }
+    if (handle.get_ortho_strategy() == 1) {
       for (size_t i = 0; i < j + 1; ++i) {
-        member.team_barrier();  // Finish writing to W
-        auto V_i = Kokkos::subview(V, Kokkos::ALL, i, Kokkos::ALL);
+        auto V_i = Kokkos::subview(V_view, Kokkos::ALL, i, Kokkos::ALL);
         TeamVectorDot<MemberType>::invoke(member, W, V_i, tmp);
         member.team_barrier();
         TeamVectorCopy1D::invoke(member, tmp,
-                                 Kokkos::subview(H, Kokkos::ALL, i, j));
-
-        member.team_barrier();  // Don't start modifying tmp until copy above
-                                // finishes
+                                 Kokkos::subview(H_view, Kokkos::ALL, j, i));
+        member.team_barrier();
         Kokkos::parallel_for(
             Kokkos::TeamVectorRange(member, 0, numMatrices),
             [&](const OrdinalType& ii) { tmp(ii) = -tmp(ii); });
@@ -190,117 +225,161 @@ struct TeamVectorGMRES {
         member.team_barrier();  // Finish writing to tmp
 
         TeamVectorAxpy<MemberType>::invoke(member, tmp, V_i, W);
+        member.team_barrier();  // Finish writing to W
       }
+    }
 
-      member.team_barrier();  // Finish writing to W
-      TeamVectorDot<MemberType>::invoke(member, W, W, tmp);
-      member.team_barrier();
-      Kokkos::parallel_for(
-          Kokkos::TeamVectorRange(member, 0, numMatrices),
-          [&](const OrdinalType& i) {
-            H(i, j + 1, j) = ATM::sqrt(tmp(i));
-            tmp(i) = H(i, j + 1, j) > max_tolerance ? 1. / H(i, j + 1, j) : 0.;
-          });
-      member.team_barrier();
+    TeamVectorDot<MemberType>::invoke(member, W, W, tmp);
+    member.team_barrier();
+    Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices),
+                         [&](const OrdinalType& i) {
+                           H_view(i, j, j + 1) = ATM::sqrt(tmp(i));
+                           tmp(i) = H_view(i, j, j + 1) > max_tolerance
+                                        ? 1. / H_view(i, j, j + 1)
+                                        : 0.;
+                         });
+    member.team_barrier();
+    if (j + 1 < maximum_iteration) {
+      auto V_n = Kokkos::subview(V_view, Kokkos::ALL, j + 1, Kokkos::ALL);
       Kokkos::parallel_for(
           Kokkos::TeamVectorRange(member, 0, numMatrices * numRows),
           [&](const OrdinalType& iTemp) {
             OrdinalType iRow, iMatrix;
             getIndices<OrdinalType, typename VectorViewType::array_layout>(
                 iTemp, numRows, numMatrices, iRow, iMatrix);
-            V(iMatrix, j + 1, iRow) = W(iMatrix, iRow) * tmp(iMatrix);
-          });
-
-      Kokkos::parallel_for(
-          Kokkos::TeamVectorRange(member, 0, numMatrices),
-          [&](const OrdinalType& l) {
-            // Apply the previous Givens rotations:
-            auto H_j = Kokkos::subview(H, l, Kokkos::ALL, j);
-
-            if (mask(l) == 1.) {
-              for (size_t i = 0; i < j; ++i) {
-                auto tmp1 =
-                    Givens(l, i, 0) * H_j(i) + Givens(l, i, 1) * H_j(i + 1);
-                auto tmp2 =
-                    -Givens(l, i, 1) * H_j(i) + Givens(l, i, 0) * H_j(i + 1);
-                H_j(i)     = tmp1;
-                H_j(i + 1) = tmp2;
-              }
-
-              // Compute the new Givens rotation:
-              Kokkos::pair<typename VectorViewType::non_const_value_type,
-                           typename VectorViewType::non_const_value_type>
-                  G_new(1, 0);
-              typename VectorViewType::non_const_value_type alpha = 0;
-              SerialGivensInternal::invoke(H_j(j), H_j(j + 1), &G_new, &alpha);
-
-              Givens(l, j, 0) = G_new.first;
-              Givens(l, j, 1) = G_new.second;
-
-              // Apply the new Givens rotation:
-              auto tmp1 =
-                  Givens(l, j, 0) * H_j(j) + Givens(l, j, 1) * H_j(j + 1);
-              auto tmp2 =
-                  -Givens(l, j, 1) * H_j(j) + Givens(l, j, 0) * H_j(j + 1);
-              H_j(j)     = tmp1;
-              H_j(j + 1) = tmp2;
-
-              G(l, j + 1) = -Givens(l, j, 1) * G(l, j);
-              G(l, j) *= Givens(l, j, 0);
-            } else {
-              H_j(j)      = 1.;
-              G(l, j + 1) = 0.;
-            }
-
-            if (mask(l) == 1. &&
-                Kokkos::ArithTraits<double>::abs(G(l, j + 1)) / beta(l) <
-                    tolerance) {
-              mask(l)     = 0.;
-              G(l, j + 1) = 0.;
-            }
+            V_n(iMatrix, iRow) = W(iMatrix, iRow) * tmp(iMatrix);
           });
+      member.team_barrier();
     }
 
-    member.team_barrier();  // Finish writing to G
-
     Kokkos::parallel_for(
         Kokkos::TeamVectorRange(member, 0, numMatrices),
         [&](const OrdinalType& l) {
-          SerialTrsm<Side::Left, Uplo::Upper, Trans::NoTranspose, Diag::NonUnit,
-                     Algo::Trsm::Unblocked>::template invoke(1,
-                                                             Kokkos::subview(
-                                                                 H, l,
-                                                                 Kokkos::ALL,
-                                                                 Kokkos::ALL),
-                                                             Kokkos::subview(
-                                                                 G, l,
-                                                                 Kokkos::ALL));
+          // Apply the previous Givens rotations:
+          auto H_j        = Kokkos::subview(H_view, l, j, Kokkos::ALL);
+          auto Givens_0_l = Kokkos::subview(Givens_view, l, Kokkos::ALL, 0);
+          auto Givens_1_l = Kokkos::subview(Givens_view, l, Kokkos::ALL, 1);
+
+          if (mask(l) == 1.) {
+            for (size_t i = 0; i < j; ++i) {
+              auto tmp1  = Givens_0_l(i) * H_j(i) + Givens_1_l(i) * H_j(i + 1);
+              auto tmp2  = -Givens_1_l(i) * H_j(i) + Givens_0_l(i) * H_j(i + 1);
+              H_j(i)     = tmp1;
+              H_j(i + 1) = tmp2;
+            }
+
+            // Compute the new Givens rotation:
+            Kokkos::pair<typename VectorViewType::non_const_value_type,
+                         typename VectorViewType::non_const_value_type>
+                G_new(1, 0);
+            typename VectorViewType::non_const_value_type alpha = 0;
+            SerialGivensInternal::invoke(H_j(j), H_j(j + 1), &G_new, &alpha);
+
+            Givens_0_l(j) = G_new.first;
+            Givens_1_l(j) = G_new.second;
+
+            // Apply the new Givens rotation:
+            auto tmp1  = Givens_0_l(j) * H_j(j) + Givens_1_l(j) * H_j(j + 1);
+            auto tmp2  = -Givens_1_l(j) * H_j(j) + Givens_0_l(j) * H_j(j + 1);
+            H_j(j)     = tmp1;
+            H_j(j + 1) = tmp2;
+
+            G(l, j + 1) = -Givens_1_l(j) * G(l, j);
+            G(l, j) *= Givens_0_l(j);
+          } else {
+            H_j(j)      = 1.;
+            G(l, j + 1) = 0.;
+          }
+
+          auto res_norm =
+              Kokkos::ArithTraits<double>::abs(G(l, j + 1)) / G(l, 0);
+
+          handle.set_norm(member.league_rank(), l, j + 1, res_norm);
+
+          if (mask(l) == 1. && res_norm < tolerance) {
+            mask(l)     = 0.;
+            G(l, j + 1) = 0.;
+            handle.set_iteration(member.league_rank(), l, j + 1);
+          }
         });
+    member.team_barrier();
+
+    bool all_converged = true;
+    for (OrdinalType l = 0; l < numMatrices; ++l)
+      all_converged = (all_converged && mask(l) == 0.);
+    if (all_converged) {
+      maximum_iteration = j + 1;
+      break;
+    }
+  }
+
+  member.team_barrier();  // Finish writing to G
+
+  auto first_indices = Kokkos::make_pair(0, (int)maximum_iteration);
+
+  Kokkos::parallel_for(
+      Kokkos::TeamVectorRange(member, 0, numMatrices),
+      [&](const OrdinalType& l) {
+        auto A_l = Kokkos::subview(H_view, l, first_indices, first_indices);
+        auto B_l = Kokkos::subview(G, l, first_indices);
 
-    member.team_barrier();  // Finish writing to G
+        SerialTrsm<Side::Left, Uplo::Lower, Trans::Transpose, Diag::NonUnit,
+                   Algo::Trsm::Unblocked>::invoke(1, A_l, B_l);
+      });
 
+  member.team_barrier();  // Finish writing to G
+
+  if (handle.get_ortho_strategy() == 0) {
+    TeamVectorGemv<MemberType, Trans::Transpose, Algo::Gemv::Unblocked>::invoke(
+        member, 1,
+        Kokkos::subview(V_view, Kokkos::ALL, first_indices, Kokkos::ALL),
+        Kokkos::subview(G, Kokkos::ALL, first_indices), 1, X);
+    member.team_barrier();  // Finish writing to X
+  }
+  if (handle.get_ortho_strategy() == 1) {
     for (size_t j = 0; j < maximum_iteration; ++j) {
       TeamVectorAxpy<MemberType>::invoke(
           member, Kokkos::subview(G, Kokkos::ALL, j),
-          Kokkos::subview(V, Kokkos::ALL, j, Kokkos::ALL), X);
+          Kokkos::subview(V_view, Kokkos::ALL, j, Kokkos::ALL), X);
       member.team_barrier();  // Finish writing to X
     }
-
-    TeamVectorCopy<MemberType>::invoke(member, X, _X);
-    return status;
   }
 
-  template <typename OperatorType, typename VectorViewType>
-  KOKKOS_INLINE_FUNCTION static int invoke(
-      const MemberType& member, const OperatorType& A, const VectorViewType& _B,
-      const VectorViewType& _X,
-      const KrylovHandle<typename VectorViewType::non_const_value_type>&
-          handle) {
-    Identity P;
-    return invoke<OperatorType, VectorViewType, Identity>(member, A, _B, _X, P,
-                                                          handle);
+  TeamVectorCopy<MemberType>::invoke(member, X, _X);
+
+  member.team_barrier();
+
+  if (handle.get_compute_last_residual()) {
+    TeamVectorCopy<MemberType>::invoke(member, _B, W);
+    member.team_barrier();
+    A.template apply<Trans::NoTranspose, Mode::TeamVector>(member, X, W, -1, 1);
+    member.team_barrier();
+    P.template apply<Trans::NoTranspose, Mode::TeamVector, 1>(member, W, W);
+    member.team_barrier();
+    TeamVectorDot<MemberType>::invoke(member, W, W, tmp);
+    member.team_barrier();
+
+    Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices),
+                         [&](const OrdinalType& i) {
+                           tmp(i) = ATM::sqrt(tmp(i));
+                           handle.set_last_norm(member.league_rank(), i,
+                                                tmp(i));
+                         });
   }
-};
+  return status;
+}
+
+template <typename MemberType>
+template <typename OperatorType, typename VectorViewType,
+          typename KrylovHandleType>
+KOKKOS_INLINE_FUNCTION int TeamVectorGMRES<MemberType>::invoke(
+    const MemberType& member, const OperatorType& A, const VectorViewType& _B,
+    const VectorViewType& _X, const KrylovHandleType& handle) {
+  Identity P;
+  return invoke<OperatorType, VectorViewType, Identity>(member, A, _B, _X, P,
+                                                        handle);
+}
+
 }  // namespace KokkosBatched
 
 #endif
diff --git a/src/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp b/src/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp
index 4b4bd06bc0..41ac90e61d 100644
--- a/src/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp
+++ b/src/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp
@@ -54,6 +54,7 @@
 #include "KokkosBatched_Givens_Serial_Internal.hpp"
 #include "KokkosBatched_Trsm_Decl.hpp"
 #include "KokkosBatched_Identity.hpp"
+#include "KokkosBatched_Gemv_Decl.hpp"
 
 namespace KokkosBatched {
 
@@ -63,123 +64,157 @@ namespace KokkosBatched {
 ///
 
 template <typename MemberType>
-struct TeamGMRES {
-  template <typename OperatorType, typename VectorViewType,
-            typename PrecOperatorType>
-  KOKKOS_INLINE_FUNCTION static int invoke(
-      const MemberType& member, const OperatorType& A, const VectorViewType& _B,
-      const VectorViewType& _X, const PrecOperatorType& P,
-      const KrylovHandle<typename VectorViewType::non_const_value_type>&
-          handle) {
-    typedef int OrdinalType;
-    typedef typename Kokkos::Details::ArithTraits<
-        typename VectorViewType::non_const_value_type>::mag_type MagnitudeType;
-    typedef Kokkos::Details::ArithTraits<MagnitudeType> ATM;
-
-    using ScratchPadNormViewType = Kokkos::View<
-        MagnitudeType*,
-        typename VectorViewType::execution_space::scratch_memory_space>;
-    using ScratchPadVectorViewType = Kokkos::View<
-        typename VectorViewType::non_const_value_type**,
-        typename VectorViewType::array_layout,
-        typename VectorViewType::execution_space::scratch_memory_space>;
-    using ScratchPadMultiVectorViewType = Kokkos::View<
-        typename VectorViewType::non_const_value_type***,
-        typename VectorViewType::array_layout,
-        typename VectorViewType::execution_space::scratch_memory_space>;
-    using TeamCopy1D = TeamCopy<MemberType, Trans::NoTranspose, 1>;
-
-    const OrdinalType numMatrices = _X.extent(0);
-    const OrdinalType numRows     = _X.extent(1);
-
-    size_t maximum_iteration = handle.get_max_iteration() < numRows
-                                   ? handle.get_max_iteration()
-                                   : numRows;
-    const MagnitudeType tolerance     = handle.get_tolerance();
-    const MagnitudeType max_tolerance = 0.;
-
-    ScratchPadMultiVectorViewType V(member.team_scratch(1), numMatrices,
-                                    maximum_iteration + 1, numRows);
-    ScratchPadMultiVectorViewType H(member.team_scratch(1), numMatrices,
-                                    maximum_iteration + 1, maximum_iteration);
-    ScratchPadMultiVectorViewType Givens(member.team_scratch(1), numMatrices,
-                                         maximum_iteration, 2);
-    ScratchPadVectorViewType G(member.team_scratch(1), numMatrices,
-                               maximum_iteration + 1);
-
-    ScratchPadVectorViewType W(member.team_scratch(0), numMatrices, numRows);
-    ScratchPadVectorViewType Q(member.team_scratch(0), numMatrices, numRows);
-    ScratchPadVectorViewType R(member.team_scratch(0), numMatrices, numRows);
-    ScratchPadVectorViewType X(member.team_scratch(0), numMatrices, numRows);
-
-    ScratchPadNormViewType beta(member.team_scratch(0), numMatrices);
-    ScratchPadNormViewType mask(member.team_scratch(0), numMatrices);
-    ScratchPadNormViewType tmp(member.team_scratch(0), numMatrices);
-
-    TeamCopy<MemberType>::invoke(member, _X, X);
-    // Deep copy of b into r_0:
-    TeamCopy<MemberType>::invoke(member, _B, R);
-
-    Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices),
-                         [&](const OrdinalType& i) { mask(i) = 1.; });
-
-    // r_0 := b - A x_0
-    member.team_barrier();
-    A.template apply<MemberType, ScratchPadVectorViewType,
-                     ScratchPadVectorViewType, Trans::NoTranspose, Mode::Team>(
-        member, X, R, -1, 1);
+template <typename OperatorType, typename VectorViewType,
+          typename PrecOperatorType, typename KrylovHandleType>
+KOKKOS_INLINE_FUNCTION int TeamGMRES<MemberType>::invoke(
+    const MemberType& member, const OperatorType& A, const VectorViewType& _B,
+    const VectorViewType& _X, const PrecOperatorType& P,
+    const KrylovHandleType& handle) {
+  typedef int OrdinalType;
+  typedef typename Kokkos::Details::ArithTraits<
+      typename VectorViewType::non_const_value_type>::mag_type MagnitudeType;
+  typedef Kokkos::Details::ArithTraits<MagnitudeType> ATM;
+
+  using ScratchPadVectorViewType = Kokkos::View<
+      typename VectorViewType::non_const_value_type**,
+      typename VectorViewType::array_layout,
+      typename VectorViewType::execution_space::scratch_memory_space>;
+  using TeamCopy1D = TeamCopy<MemberType, Trans::NoTranspose, 1>;
+
+  const OrdinalType numMatrices = _X.extent(0);
+  const OrdinalType numRows     = _X.extent(1);
+
+  size_t maximum_iteration = handle.get_max_iteration() < numRows
+                                 ? handle.get_max_iteration()
+                                 : numRows;
+  const MagnitudeType tolerance     = handle.get_tolerance();
+  const MagnitudeType max_tolerance = handle.get_max_tolerance();
+
+  int n_V      = numRows;
+  int n_H      = maximum_iteration + 1;
+  int n_Givens = 2;
+
+  int offset_V      = 0;
+  int offset_H      = offset_V + n_V;
+  int offset_Givens = offset_H + n_H;
+
+  const int first_matrix = handle.first_index(member.league_rank());
+  const int last_matrix  = handle.last_index(member.league_rank());
+
+  auto V_view = Kokkos::subview(
+      handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix),
+      Kokkos::ALL, Kokkos::make_pair(offset_V, offset_V + n_V));
+  auto H_view = Kokkos::subview(
+      handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix),
+      Kokkos::ALL, Kokkos::make_pair(offset_H, offset_H + n_H));
+  auto Givens_view = Kokkos::subview(
+      handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix),
+      Kokkos::ALL, Kokkos::make_pair(offset_Givens, offset_Givens + n_Givens));
+
+  int n_G    = maximum_iteration + 1;
+  int n_W    = numRows;
+  int n_X    = numRows;
+  int n_mask = 1;
+  int n_tmp  = 1;
+
+  int offset_G    = 0;
+  int offset_W    = offset_G + n_G;
+  int offset_X    = offset_W + n_W;
+  int offset_mask = offset_X + n_X;
+  int offset_tmp  = offset_mask + n_mask;
+
+  ScratchPadVectorViewType tmp_2D(
+      member.team_scratch(handle.get_scratch_pad_level()), numMatrices,
+      n_G + n_W + n_X + n_mask + n_tmp);
+
+  auto G    = Kokkos::subview(tmp_2D, Kokkos::ALL,
+                           Kokkos::make_pair(offset_G, offset_G + n_G));
+  auto W    = Kokkos::subview(tmp_2D, Kokkos::ALL,
+                           Kokkos::make_pair(offset_W, offset_W + n_W));
+  auto X    = Kokkos::subview(tmp_2D, Kokkos::ALL,
+                           Kokkos::make_pair(offset_X, offset_X + n_X));
+  auto mask = Kokkos::subview(tmp_2D, Kokkos::ALL, offset_mask);
+  auto tmp  = Kokkos::subview(tmp_2D, Kokkos::ALL, offset_tmp);
+
+  TeamCopy<MemberType>::invoke(member, _X, X);
+  // Deep copy of b into r_0:
+  TeamCopy<MemberType>::invoke(member, _B, W);
+
+  // r_0 := b - A x_0
+  member.team_barrier();
+  A.template apply<Trans::NoTranspose, Mode::Team>(member, X, W, -1, 1);
+  member.team_barrier();
+
+  P.template apply<Trans::NoTranspose, Mode::Team, 1>(member, W, W);
+  member.team_barrier();
+
+  TeamDot<MemberType>::invoke(member, W, W, tmp);
+  member.team_barrier();
+
+  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices),
+                       [&](const OrdinalType& i) {
+                         tmp(i) = ATM::sqrt(tmp(i));
+                         handle.set_norm(member.league_rank(), i, 0, tmp(i));
+                         if (tmp(i) > max_tolerance) {
+                           mask(i) = 1;
+                           G(i, 0) = tmp(i);
+                           tmp(i)  = 1. / tmp(i);
+                         } else {
+                           handle.set_iteration(member.league_rank(), i, 0);
+                           mask(i) = 0;
+                           G(i, 0) = 0.;
+                           tmp(i)  = 0.;
+                         }
+                       });
+
+  member.team_barrier();  // Finish writing to tmp
+
+  auto V_0 = Kokkos::subview(V_view, Kokkos::ALL, 0, Kokkos::ALL);
+  Kokkos::parallel_for(
+      Kokkos::TeamThreadRange(member, 0, numMatrices * numRows),
+      [&](const OrdinalType& iTemp) {
+        OrdinalType iRow, iMatrix;
+        getIndices<OrdinalType, typename VectorViewType::array_layout>(
+            iTemp, numRows, numMatrices, iRow, iMatrix);
+        V_0(iMatrix, iRow) = W(iMatrix, iRow) * tmp(iMatrix);
+      });
+  int status = 1;
+  // int number_not_converged = 0;
+
+  for (size_t j = 0; j < maximum_iteration; ++j) {
+    member.team_barrier();  // Finish writing to V
+    // q := A p_j
+    auto V_j = Kokkos::subview(V_view, Kokkos::ALL, j, Kokkos::ALL);
+
+    A.template apply<Trans::NoTranspose, Mode::Team>(member, V_j, W);
     member.team_barrier();
 
-    P.template apply<MemberType, ScratchPadVectorViewType,
-                     ScratchPadVectorViewType, Trans::NoTranspose, Mode::Team,
-                     1>(member, R, R);
+    P.template apply<Trans::NoTranspose, Mode::Team, 1>(member, W, W);
     member.team_barrier();
 
-    TeamDot<MemberType>::invoke(member, R, R, beta);
-    member.team_barrier();
-
-    Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices),
-                         [&](const OrdinalType& i) {
-                           beta(i) = ATM::sqrt(beta(i));
-                           G(i, 0) = beta(i) > max_tolerance ? beta(i) : 0.;
-                           tmp(i) = beta(i) > max_tolerance ? 1. / beta(i) : 0.;
-                         });
-
-    member.team_barrier();  // Finish writing to tmp
-
-    Kokkos::parallel_for(
-        Kokkos::TeamThreadRange(member, 0, numMatrices * numRows),
-        [&](const OrdinalType& iTemp) {
-          OrdinalType iRow, iMatrix;
-          getIndices<OrdinalType, typename VectorViewType::array_layout>(
-              iTemp, numRows, numMatrices, iRow, iMatrix);
-          V(iMatrix, 0, iRow) = R(iMatrix, iRow) * tmp(iMatrix);
-        });
-
-    int status = 1;
-    // int number_not_converged = 0;
-
-    for (size_t j = 0; j < maximum_iteration; ++j) {
-      member.team_barrier();  // Finish writing to V
-      // q := A p_j
-      auto V_j = Kokkos::subview(V, Kokkos::ALL, j, Kokkos::ALL);
-
-      A.template apply<MemberType, ScratchPadVectorViewType,
-                       ScratchPadVectorViewType, Trans::NoTranspose,
-                       Mode::Team>(member, V_j, W);
+    if (handle.get_ortho_strategy() == 0) {
+      auto V_old = Kokkos::subview(
+          V_view, Kokkos::ALL, Kokkos::make_pair(0, (int)j + 1), Kokkos::ALL);
+      auto H_old = Kokkos::subview(H_view, Kokkos::ALL, j,
+                                   Kokkos::make_pair(0, (int)j + 1));
+      // Inner products
+      TeamGemv<MemberType, Trans::NoTranspose, Algo::Gemv::Unblocked>::invoke(
+          member, 1, V_old, W, 0, H_old);
       member.team_barrier();
-      P.template apply<MemberType, ScratchPadVectorViewType,
-                       ScratchPadVectorViewType, Trans::NoTranspose, Mode::Team,
-                       1>(member, W, W);
 
+      // Update
+      TeamGemv<MemberType, Trans::Transpose, Algo::Gemv::Unblocked>::invoke(
+          member, -1, V_old, H_old, 1, W);
+      member.team_barrier();  // Finish writing to W
+    }
+    if (handle.get_ortho_strategy() == 1) {
       for (size_t i = 0; i < j + 1; ++i) {
-        member.team_barrier();  // Finish writing to W
-        auto V_i = Kokkos::subview(V, Kokkos::ALL, i, Kokkos::ALL);
+        auto V_i = Kokkos::subview(V_view, Kokkos::ALL, i, Kokkos::ALL);
         TeamDot<MemberType>::invoke(member, W, V_i, tmp);
         member.team_barrier();
-        TeamCopy1D::invoke(member, tmp, Kokkos::subview(H, Kokkos::ALL, i, j));
-        member.team_barrier();  // Don't start modifying tmp until copy above
-                                // finishes
+        TeamCopy1D::invoke(member, tmp,
+                           Kokkos::subview(H_view, Kokkos::ALL, j, i));
+        member.team_barrier();
         Kokkos::parallel_for(
             Kokkos::TeamThreadRange(member, 0, numMatrices),
             [&](const OrdinalType& ii) { tmp(ii) = -tmp(ii); });
@@ -187,117 +222,161 @@ struct TeamGMRES {
         member.team_barrier();  // Finish writing to tmp
 
         TeamAxpy<MemberType>::invoke(member, tmp, V_i, W);
+        member.team_barrier();  // Finish writing to W
       }
+    }
 
-      member.team_barrier();  // Finish writing to W
-      TeamDot<MemberType>::invoke(member, W, W, tmp);
-      member.team_barrier();
-      Kokkos::parallel_for(
-          Kokkos::TeamThreadRange(member, 0, numMatrices),
-          [&](const OrdinalType& i) {
-            H(i, j + 1, j) = ATM::sqrt(tmp(i));
-            tmp(i) = H(i, j + 1, j) > max_tolerance ? 1. / H(i, j + 1, j) : 0.;
-          });
-      member.team_barrier();
+    TeamDot<MemberType>::invoke(member, W, W, tmp);
+    member.team_barrier();
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices),
+                         [&](const OrdinalType& i) {
+                           H_view(i, j, j + 1) = ATM::sqrt(tmp(i));
+                           tmp(i) = H_view(i, j, j + 1) > max_tolerance
+                                        ? 1. / H_view(i, j, j + 1)
+                                        : 0.;
+                         });
+    member.team_barrier();
+    if (j + 1 < maximum_iteration) {
+      auto V_n = Kokkos::subview(V_view, Kokkos::ALL, j + 1, Kokkos::ALL);
       Kokkos::parallel_for(
           Kokkos::TeamThreadRange(member, 0, numMatrices * numRows),
           [&](const OrdinalType& iTemp) {
             OrdinalType iRow, iMatrix;
             getIndices<OrdinalType, typename VectorViewType::array_layout>(
                 iTemp, numRows, numMatrices, iRow, iMatrix);
-            V(iMatrix, j + 1, iRow) = W(iMatrix, iRow) * tmp(iMatrix);
-          });
-
-      Kokkos::parallel_for(
-          Kokkos::TeamThreadRange(member, 0, numMatrices),
-          [&](const OrdinalType& l) {
-            // Apply the previous Givens rotations:
-            auto H_j = Kokkos::subview(H, l, Kokkos::ALL, j);
-
-            if (mask(l) == 1.) {
-              for (size_t i = 0; i < j; ++i) {
-                auto tmp1 =
-                    Givens(l, i, 0) * H_j(i) + Givens(l, i, 1) * H_j(i + 1);
-                auto tmp2 =
-                    -Givens(l, i, 1) * H_j(i) + Givens(l, i, 0) * H_j(i + 1);
-                H_j(i)     = tmp1;
-                H_j(i + 1) = tmp2;
-              }
-
-              // Compute the new Givens rotation:
-              Kokkos::pair<typename VectorViewType::non_const_value_type,
-                           typename VectorViewType::non_const_value_type>
-                  G_new(1, 0);
-              typename VectorViewType::non_const_value_type alpha = 0;
-              SerialGivensInternal::invoke(H_j(j), H_j(j + 1), &G_new, &alpha);
-
-              Givens(l, j, 0) = G_new.first;
-              Givens(l, j, 1) = G_new.second;
-
-              // Apply the new Givens rotation:
-              auto tmp1 =
-                  Givens(l, j, 0) * H_j(j) + Givens(l, j, 1) * H_j(j + 1);
-              auto tmp2 =
-                  -Givens(l, j, 1) * H_j(j) + Givens(l, j, 0) * H_j(j + 1);
-              H_j(j)     = tmp1;
-              H_j(j + 1) = tmp2;
-
-              G(l, j + 1) = -Givens(l, j, 1) * G(l, j);
-              G(l, j) *= Givens(l, j, 0);
-            } else {
-              H_j(j)      = 1.;
-              G(l, j + 1) = 0.;
-            }
-
-            if (mask(l) == 1. &&
-                Kokkos::ArithTraits<double>::abs(G(l, j + 1)) / beta(l) <
-                    tolerance) {
-              mask(l)     = 0.;
-              G(l, j + 1) = 0.;
-            }
+            V_n(iMatrix, iRow) = W(iMatrix, iRow) * tmp(iMatrix);
           });
+      member.team_barrier();
     }
 
-    member.team_barrier();  // Finish writing to G
-
     Kokkos::parallel_for(
         Kokkos::TeamThreadRange(member, 0, numMatrices),
         [&](const OrdinalType& l) {
-          SerialTrsm<Side::Left, Uplo::Upper, Trans::NoTranspose, Diag::NonUnit,
-                     Algo::Trsm::Unblocked>::template invoke(1,
-                                                             Kokkos::subview(
-                                                                 H, l,
-                                                                 Kokkos::ALL,
-                                                                 Kokkos::ALL),
-                                                             Kokkos::subview(
-                                                                 G, l,
-                                                                 Kokkos::ALL));
+          // Apply the previous Givens rotations:
+          auto H_j        = Kokkos::subview(H_view, l, j, Kokkos::ALL);
+          auto Givens_0_l = Kokkos::subview(Givens_view, l, Kokkos::ALL, 0);
+          auto Givens_1_l = Kokkos::subview(Givens_view, l, Kokkos::ALL, 1);
+
+          if (mask(l) == 1.) {
+            for (size_t i = 0; i < j; ++i) {
+              auto tmp1  = Givens_0_l(i) * H_j(i) + Givens_1_l(i) * H_j(i + 1);
+              auto tmp2  = -Givens_1_l(i) * H_j(i) + Givens_0_l(i) * H_j(i + 1);
+              H_j(i)     = tmp1;
+              H_j(i + 1) = tmp2;
+            }
+
+            // Compute the new Givens rotation:
+            Kokkos::pair<typename VectorViewType::non_const_value_type,
+                         typename VectorViewType::non_const_value_type>
+                G_new(1, 0);
+            typename VectorViewType::non_const_value_type alpha = 0;
+            SerialGivensInternal::invoke(H_j(j), H_j(j + 1), &G_new, &alpha);
+
+            Givens_0_l(j) = G_new.first;
+            Givens_1_l(j) = G_new.second;
+
+            // Apply the new Givens rotation:
+            auto tmp1  = Givens_0_l(j) * H_j(j) + Givens_1_l(j) * H_j(j + 1);
+            auto tmp2  = -Givens_1_l(j) * H_j(j) + Givens_0_l(j) * H_j(j + 1);
+            H_j(j)     = tmp1;
+            H_j(j + 1) = tmp2;
+
+            G(l, j + 1) = -Givens_1_l(j) * G(l, j);
+            G(l, j) *= Givens_0_l(j);
+          } else {
+            H_j(j)      = 1.;
+            G(l, j + 1) = 0.;
+          }
+
+          auto res_norm =
+              Kokkos::ArithTraits<double>::abs(G(l, j + 1)) / G(l, 0);
+
+          handle.set_norm(member.league_rank(), l, j + 1, res_norm);
+
+          if (mask(l) == 1. && res_norm < tolerance) {
+            mask(l)     = 0.;
+            G(l, j + 1) = 0.;
+            handle.set_iteration(member.league_rank(), l, j + 1);
+          }
         });
+    member.team_barrier();
+
+    bool all_converged = true;
+    for (OrdinalType l = 0; l < numMatrices; ++l)
+      all_converged = (all_converged && mask(l) == 0.);
+    if (all_converged) {
+      maximum_iteration = j + 1;
+      break;
+    }
+  }
+
+  member.team_barrier();  // Finish writing to G
+
+  auto first_indices = Kokkos::make_pair(0, (int)maximum_iteration);
+
+  Kokkos::parallel_for(
+      Kokkos::TeamVectorRange(member, 0, numMatrices),
+      [&](const OrdinalType& l) {
+        auto A_l = Kokkos::subview(H_view, l, first_indices, first_indices);
+        auto B_l = Kokkos::subview(G, l, first_indices);
 
-    member.team_barrier();  // Finish writing to G
+        SerialTrsm<Side::Left, Uplo::Lower, Trans::Transpose, Diag::NonUnit,
+                   Algo::Trsm::Unblocked>::invoke(1, A_l, B_l);
+      });
 
+  member.team_barrier();  // Finish writing to G
+
+  if (handle.get_ortho_strategy() == 0) {
+    TeamGemv<MemberType, Trans::Transpose, Algo::Gemv::Unblocked>::invoke(
+        member, 1,
+        Kokkos::subview(V_view, Kokkos::ALL, first_indices, Kokkos::ALL),
+        Kokkos::subview(G, Kokkos::ALL, first_indices), 1, X);
+    member.team_barrier();  // Finish writing to X
+  }
+  if (handle.get_ortho_strategy() == 1) {
     for (size_t j = 0; j < maximum_iteration; ++j) {
       TeamAxpy<MemberType>::invoke(
           member, Kokkos::subview(G, Kokkos::ALL, j),
-          Kokkos::subview(V, Kokkos::ALL, j, Kokkos::ALL), X);
+          Kokkos::subview(V_view, Kokkos::ALL, j, Kokkos::ALL), X);
       member.team_barrier();  // Finish writing to X
     }
-
-    TeamCopy<MemberType>::invoke(member, X, _X);
-    return status;
   }
 
-  template <typename OperatorType, typename VectorViewType>
-  KOKKOS_INLINE_FUNCTION static int invoke(
-      const MemberType& member, const OperatorType& A, const VectorViewType& _B,
-      const VectorViewType& _X,
-      const KrylovHandle<typename VectorViewType::non_const_value_type>&
-          handle) {
-    Identity P;
-    return invoke<OperatorType, VectorViewType, Identity>(member, A, _B, _X, P,
-                                                          handle);
+  TeamCopy<MemberType>::invoke(member, X, _X);
+
+  member.team_barrier();
+
+  if (handle.get_compute_last_residual()) {
+    TeamCopy<MemberType>::invoke(member, _B, W);
+    member.team_barrier();
+    A.template apply<Trans::NoTranspose, Mode::Team>(member, X, W, -1, 1);
+    member.team_barrier();
+    P.template apply<Trans::NoTranspose, Mode::Team, 1>(member, W, W);
+    member.team_barrier();
+    TeamDot<MemberType>::invoke(member, W, W, tmp);
+    member.team_barrier();
+
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices),
+                         [&](const OrdinalType& i) {
+                           tmp(i) = ATM::sqrt(tmp(i));
+                           handle.set_last_norm(member.league_rank(), i,
+                                                tmp(i));
+                         });
   }
-};
+  return status;
+}
+
+template <typename MemberType>
+template <typename OperatorType, typename VectorViewType,
+          typename KrylovHandleType>
+KOKKOS_INLINE_FUNCTION int TeamGMRES<MemberType>::invoke(
+    const MemberType& member, const OperatorType& A, const VectorViewType& _B,
+    const VectorViewType& _X, const KrylovHandleType& handle) {
+  Identity P;
+  return invoke<OperatorType, VectorViewType, Identity>(member, A, _B, _X, P,
+                                                        handle);
+}
+
 }  // namespace KokkosBatched
 
 #endif
diff --git a/src/blas/KokkosBlas1_axpby.hpp b/src/blas/KokkosBlas1_axpby.hpp
index cae0cc7102..e8b79df565 100644
--- a/src/blas/KokkosBlas1_axpby.hpp
+++ b/src/blas/KokkosBlas1_axpby.hpp
@@ -46,6 +46,7 @@
 #define KOKKOSBLAS1_AXPBY_HPP_
 
 #include <KokkosBlas1_axpby_spec.hpp>
+#include <KokkosBlas_serial_axpy.hpp>
 #include <KokkosKernels_helpers.hpp>
 #include <KokkosKernels_Error.hpp>
 
@@ -124,6 +125,32 @@ void axpy(const AV& a, const XMV& X, const YMV& Y) {
         Y);
 }
 
+///
+/// Serial axpy on device
+///
+template <class scalar_type, class XMV, class YMV>
+KOKKOS_FUNCTION void serial_axpy(const scalar_type alpha, const XMV X, YMV Y) {
+#if (KOKKOSKERNELS_DEBUG_LEVEL > 0)
+  static_assert(Kokkos::is_view<XMV>::value,
+                "KokkosBlas::serial_axpy: XMV is not a Kokkos::View");
+  static_assert(Kokkos::is_view<YMV>::value,
+                "KokkosBlas::serial_axpy: YMV is not a Kokkos::View");
+  static_assert(XMV::Rank == 1 || XMV::Rank == 2,
+                "KokkosBlas::serial_axpy: XMV must have rank 1 or 2.");
+  static_assert(
+      XMV::Rank == YMV::Rank,
+      "KokkosBlas::serial_axpy: XMV and YMV must have the same rank.");
+
+  if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) {
+    Kokkos::abort("KokkosBlas::serial_axpy: X and Y dimensions do not match");
+  }
+#endif  // KOKKOSKERNELS_DEBUG_LEVEL
+
+  return Impl::serial_axpy_mv(X.extent(0), X.extent(1), alpha, X.data(),
+                              Y.data(), X.stride_0(), X.stride_1(),
+                              Y.stride_0(), Y.stride_1());
+}
+
 }  // namespace KokkosBlas
 
 #endif
diff --git a/src/blas/KokkosBlas1_nrm2.hpp b/src/blas/KokkosBlas1_nrm2.hpp
index 3a10e48a4d..bbe231e795 100644
--- a/src/blas/KokkosBlas1_nrm2.hpp
+++ b/src/blas/KokkosBlas1_nrm2.hpp
@@ -46,6 +46,7 @@
 #define KOKKOSBLAS1_NRM2_HPP_
 
 #include <KokkosBlas1_nrm2_spec.hpp>
+#include <KokkosBlas_serial_nrm2.hpp>
 #include <KokkosKernels_helpers.hpp>
 #include <KokkosKernels_Error.hpp>
 
@@ -156,6 +157,63 @@ void nrm2(const RV& R, const XMV& X,
 
   Impl::Nrm2<RV_Internal, XMV_Internal>::nrm2(R_internal, X_internal, true);
 }
+
+///
+/// Serial nrm2
+///
+template <class XMV>
+KOKKOS_INLINE_FUNCTION typename Kokkos::Details::InnerProductSpaceTraits<
+    typename XMV::non_const_value_type>::mag_type
+serial_nrm2(const XMV X) {
+#if (KOKKOSKERNELS_DEBUG_LEVEL > 0)
+  static_assert(Kokkos::is_view<XMV>::value,
+                "KokkosBlas::serial_nrm2: XMV is not a Kokkos::View");
+  static_assert(XMV::Rank == 1,
+                "KokkosBlas::serial_nrm2: XMV must have rank 1");
+#endif  // KOKKOSKERNELS_DEBUG_LEVEL
+
+  return Impl::serial_nrm2(X.extent(0), X.data(), X.stride_0());
+}
+
+template <class RV, class XMV>
+KOKKOS_INLINE_FUNCTION int serial_nrm2(const XMV X, const RV& R) {
+// Do some compile time check when debug is enabled
+#if (KOKKOSKERNELS_DEBUG_LEVEL > 0)
+  static_assert(Kokkos::is_view<XMV>::value,
+                "KokkosBlas::serial_nrm2: XMV is not a Kokkos::View");
+  static_assert(Kokkos::is_view<RV>::value,
+                "KokkosBlas::serial_nrm2: RV is not a Kokkos::View");
+  static_assert(std::is_same<typename RV::value_type,
+                             typename RV::non_const_value_type>::value,
+                "KokkosBlas::serial_nrm2: R is const.  "
+                "It must be nonconst, because it is an output argument "
+                "(we have to be able to write to its entries).");
+  static_assert(((RV::rank == 0) && (XMV::rank == 1)) ||
+                    ((RV::rank == 1) && (XMV::rank == 2)),
+                "KokkosBlas::serial_nrm2: "
+                "RV and XMV must either have rank 0 and 1 or rank 1 and 2.");
+
+  using norm_type = typename Kokkos::Details::InnerProductSpaceTraits<
+      typename XMV::non_const_value_type>::mag_type;
+  static_assert(
+      std::is_same<typename RV::non_const_value_type, norm_type>::value,
+      "KokkosBlas::serial_nrm2: RV must have same value_type as"
+      " Kokkos::ArithTraits<XMV::value_type>::mag_type");
+
+  if (R.extent(0) != X.extent(1)) {
+    KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+        "KokkosBlas::serial_nrm2 (MV): Dimensions of R and X do not match,"
+        " R: %d and X: %d x %d.\n",
+        R.extent_int(0), X.extent_int(0), X.extent_int(1));
+    return 1;
+  }
+#endif  // KOKKOSKERNELS_DEBUG_LEVEL
+
+  Impl::serial_nrm2(X.extent(0), X.extent(1), X.data(), X.stride_0(),
+                    X.stride_1(), R.data(), R.stride_0());
+  return 0;
+}
+
 }  // namespace KokkosBlas
 
 #endif  // KOKKOSBLAS1_NRM2_HPP_
diff --git a/src/blas/KokkosBlas1_scal.hpp b/src/blas/KokkosBlas1_scal.hpp
index 2fc4f92f58..d533efe535 100644
--- a/src/blas/KokkosBlas1_scal.hpp
+++ b/src/blas/KokkosBlas1_scal.hpp
@@ -46,9 +46,15 @@
 #define KOKKOSBLAS1_SCAL_HPP_
 
 #include <KokkosBlas1_scal_spec.hpp>
+#include <KokkosBlas1_serial_scal_impl.hpp>
+#include <KokkosBlas1_team_scal_impl.hpp>
 #include <KokkosKernels_helpers.hpp>
 #include <KokkosKernels_Error.hpp>
 
+///
+/// General/Host Scale
+///
+
 namespace KokkosBlas {
 
 template <class RMV, class AV, class XMV>
@@ -108,6 +114,51 @@ void scal(const RMV& R, const AV& a, const XMV& X) {
       R_internal, a_internal, X_internal);
 }
 
+///
+/// Serial Scale
+///
+
+struct SerialScale {
+  template <typename ScalarType, typename AViewType>
+  KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha,
+                                           const AViewType& A) {
+    return Impl::SerialScaleInternal::invoke(
+        A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), A.stride_1());
+  }
+};
+
+///
+/// Team Scale
+///
+
+template <typename MemberType>
+struct TeamScale {
+  template <typename ScalarType, typename AViewType>
+  KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member,
+                                           const ScalarType alpha,
+                                           const AViewType& A) {
+    return Impl::TeamScaleInternal::invoke(member, A.extent(0), A.extent(1),
+                                           alpha, A.data(), A.stride_0(),
+                                           A.stride_1());
+  }
+};
+
+///
+/// TeamVector Scale
+///
+
+template <typename MemberType>
+struct TeamVectorScale {
+  template <typename ScalarType, typename AViewType>
+  KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member,
+                                           const ScalarType alpha,
+                                           const AViewType& A) {
+    return Impl::TeamVectorScaleInternal::invoke(member, A.extent(0),
+                                                 A.extent(1), alpha, A.data(),
+                                                 A.stride_0(), A.stride_1());
+  }
+};
+
 }  // namespace KokkosBlas
 
 #endif
diff --git a/src/blas/KokkosBlas1_set.hpp b/src/blas/KokkosBlas1_set.hpp
new file mode 100644
index 0000000000..61c03ec17a
--- /dev/null
+++ b/src/blas/KokkosBlas1_set.hpp
@@ -0,0 +1,99 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOSBLAS1_SET_HPP_
+#define KOKKOSBLAS1_SET_HPP_
+
+#include <KokkosBlas1_set_impl.hpp>
+
+namespace KokkosBlas {
+
+///
+/// Serial Set
+///
+
+struct SerialSet {
+  template <typename ScalarType, typename AViewType>
+  KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha,
+                                           const AViewType &A) {
+    return Impl::SerialSetInternal::invoke(
+        A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), A.stride_1());
+  }
+};
+
+///
+/// Team Set
+///
+
+template <typename MemberType>
+struct TeamSet {
+  template <typename ScalarType, typename AViewType>
+  KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member,
+                                           const ScalarType alpha,
+                                           const AViewType &A) {
+    return Impl::TeamSetInternal::invoke(member, A.extent(0), A.extent(1),
+                                         alpha, A.data(), A.stride_0(),
+                                         A.stride_1());
+  }
+};
+
+///
+/// TeamVector Set
+///
+
+template <typename MemberType>
+struct TeamVectorSet {
+  template <typename ScalarType, typename AViewType>
+  KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member,
+                                           const ScalarType alpha,
+                                           const AViewType &A) {
+    return Impl::TeamVectorSetInternal::invoke(member, A.extent(0), A.extent(1),
+                                               alpha, A.data(), A.stride_0(),
+                                               A.stride_1());
+  }
+};
+
+}  // namespace KokkosBlas
+
+#endif
diff --git a/src/blas/KokkosBlas_trtri.hpp b/src/blas/KokkosBlas_trtri.hpp
index 0402b11104..afcc05d5ae 100644
--- a/src/blas/KokkosBlas_trtri.hpp
+++ b/src/blas/KokkosBlas_trtri.hpp
@@ -129,7 +129,7 @@ int trtri(const char uplo[], const char diag[], const AViewType& A) {
 
   // This is the return value type and should always reside on host
   using RViewInternalType =
-      Kokkos::View<int, typename AViewType::array_layout, Kokkos::HostSpace,
+      Kokkos::View<int, Kokkos::LayoutRight, Kokkos::HostSpace,
                    Kokkos::MemoryTraits<Kokkos::Unmanaged> >;
 
   int result;
diff --git a/src/blas/impl/KokkosBlas1_dot_impl.hpp b/src/blas/impl/KokkosBlas1_dot_impl.hpp
index cb8db757f8..5430e0177b 100644
--- a/src/blas/impl/KokkosBlas1_dot_impl.hpp
+++ b/src/blas/impl/KokkosBlas1_dot_impl.hpp
@@ -83,7 +83,7 @@ struct DotFunctor {
     Kokkos::Details::updateDot(sum, m_x(i), m_y(i));  // sum += m_x(i) * m_y(i)
   }
 
-  KOKKOS_INLINE_FUNCTION void init(volatile value_type& update) const {
+  KOKKOS_INLINE_FUNCTION void init(value_type& update) const {
     update = Kokkos::Details::ArithTraits<value_type>::zero();
   }
 
@@ -91,11 +91,6 @@ struct DotFunctor {
                                    const value_type& source) const {
     update += source;
   }
-
-  KOKKOS_INLINE_FUNCTION void join(volatile value_type& update,
-                                   const volatile value_type& source) const {
-    update += source;
-  }
 };
 
 }  // namespace Impl
diff --git a/src/blas/impl/KokkosBlas1_iamax_impl.hpp b/src/blas/impl/KokkosBlas1_iamax_impl.hpp
index dc30edf7da..8b27b3e5a3 100644
--- a/src/blas/impl/KokkosBlas1_iamax_impl.hpp
+++ b/src/blas/impl/KokkosBlas1_iamax_impl.hpp
@@ -96,13 +96,6 @@ struct V_Iamax_Functor {
     update = Kokkos::reduction_identity<typename RV::value_type>::max() + 1;
   }
 
-  KOKKOS_INLINE_FUNCTION void join(volatile value_type& update,
-                                   const volatile value_type& source) const {
-    mag_type source_val = IPT::norm(m_x(source - 1));
-    mag_type update_val = IPT::norm(m_x(update - 1));
-    if (update_val < source_val) update = source;
-  }
-
   KOKKOS_INLINE_FUNCTION void join(value_type& update,
                                    const value_type& source) const {
     mag_type source_val = IPT::norm(m_x(source - 1));
diff --git a/src/blas/impl/KokkosBlas1_nrm2_impl.hpp b/src/blas/impl/KokkosBlas1_nrm2_impl.hpp
index f2b0e826bc..e56a884655 100644
--- a/src/blas/impl/KokkosBlas1_nrm2_impl.hpp
+++ b/src/blas/impl/KokkosBlas1_nrm2_impl.hpp
@@ -105,11 +105,6 @@ struct V_Nrm2_Functor {
     update += source;
   }
 
-  KOKKOS_INLINE_FUNCTION void join(volatile value_type& update,
-                                   const volatile value_type& source) const {
-    update += source;
-  }
-
   KOKKOS_INLINE_FUNCTION void final(value_type& update) const {
     if (m_take_sqrt)
       update =
diff --git a/src/blas/impl/KokkosBlas1_nrm2w_impl.hpp b/src/blas/impl/KokkosBlas1_nrm2w_impl.hpp
index 3f202ca430..e2c858f0b3 100644
--- a/src/blas/impl/KokkosBlas1_nrm2w_impl.hpp
+++ b/src/blas/impl/KokkosBlas1_nrm2w_impl.hpp
@@ -108,11 +108,6 @@ struct V_Nrm2w_Functor {
     update += source;
   }
 
-  KOKKOS_INLINE_FUNCTION void join(volatile value_type& update,
-                                   const volatile value_type& source) const {
-    update += source;
-  }
-
   KOKKOS_INLINE_FUNCTION void final(value_type& update) const {
     if (m_take_sqrt)
       update =
diff --git a/src/blas/impl/KokkosBlas1_serial_scal_impl.hpp b/src/blas/impl/KokkosBlas1_serial_scal_impl.hpp
new file mode 100644
index 0000000000..bb411ef4a5
--- /dev/null
+++ b/src/blas/impl/KokkosBlas1_serial_scal_impl.hpp
@@ -0,0 +1,86 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOSBLAS1_SERIAL_SCAL_IMPL_HPP_
+#define KOKKOSBLAS1_SERIAL_SCAL_IMPL_HPP_
+
+#include <Kokkos_Core.hpp>
+
+namespace KokkosBlas {
+namespace Impl {
+
+///
+/// Serial Internal Impl
+/// ====================
+struct SerialScaleInternal {
+  template <typename ScalarType, typename ValueType>
+  KOKKOS_INLINE_FUNCTION static int invoke(const int m, const ScalarType alpha,
+                                           /* */ ValueType *KOKKOS_RESTRICT A,
+                                           const int as0) {
+#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
+#pragma unroll
+#endif
+    for (int i = 0; i < m; ++i) A[i * as0] *= alpha;
+
+    return 0;
+  }
+
+  template <typename ScalarType, typename ValueType>
+  KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n,
+                                           const ScalarType alpha,
+                                           /* */ ValueType *KOKKOS_RESTRICT A,
+                                           const int as0, const int as1) {
+    if (as0 > as1)
+      for (int i = 0; i < m; ++i) invoke(n, alpha, A + i * as0, as1);
+    else
+      for (int j = 0; j < n; ++j) invoke(m, alpha, A + j * as1, as0);
+
+    return 0;
+  }
+};
+
+}  // namespace Impl
+}  // namespace KokkosBlas
+
+#endif
diff --git a/src/batched/dense/impl/KokkosBatched_Set_Internal.hpp b/src/blas/impl/KokkosBlas1_set_impl.hpp
similarity index 66%
rename from src/batched/dense/impl/KokkosBatched_Set_Internal.hpp
rename to src/blas/impl/KokkosBlas1_set_impl.hpp
index f18ac4355c..a3870a2e15 100644
--- a/src/batched/dense/impl/KokkosBatched_Set_Internal.hpp
+++ b/src/blas/impl/KokkosBlas1_set_impl.hpp
@@ -1,11 +1,56 @@
-#ifndef __KOKKOSBATCHED_SET_INTERNAL_HPP__
-#define __KOKKOSBATCHED_SET_INTERNAL_HPP__
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef __KOKKOSBLAS_SET_IMPL_HPP__
+#define __KOKKOSBLAS_SET_IMPL_HPP__
 
 /// \author Kyungjoo Kim (kyukim@sandia.gov)
 
-#include "KokkosBatched_Util.hpp"
+#include "Kokkos_Core.hpp"
 
-namespace KokkosBatched {
+namespace KokkosBlas {
+namespace Impl {
 
 ///
 /// Serial Internal Impl
@@ -115,6 +160,7 @@ struct TeamVectorSetInternal {
   }
 };
 
-}  // end namespace KokkosBatched
+}  // namespace Impl
+}  // namespace KokkosBlas
 
 #endif
diff --git a/src/batched/dense/impl/KokkosBatched_Scale_Internal.hpp b/src/blas/impl/KokkosBlas1_team_scal_impl.hpp
similarity index 59%
rename from src/batched/dense/impl/KokkosBatched_Scale_Internal.hpp
rename to src/blas/impl/KokkosBlas1_team_scal_impl.hpp
index 6f313ea919..6f4fdf40b0 100644
--- a/src/batched/dense/impl/KokkosBatched_Scale_Internal.hpp
+++ b/src/blas/impl/KokkosBlas1_team_scal_impl.hpp
@@ -1,41 +1,55 @@
-#ifndef __KOKKOSBATCHED_SCALE_INTERNAL_HPP__
-#define __KOKKOSBATCHED_SCALE_INTERNAL_HPP__
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
 
-/// \author Kyungjoo Kim (kyukim@sandia.gov)
+#ifndef KOKKOSBLAS1_TEAM_SCAL_IMPL_HPP_
+#define KOKKOSBLAS1_TEAM_SCAL_IMPL_HPP_
 
-#include "KokkosBatched_Util.hpp"
+#include <Kokkos_Core.hpp>
+#include "KokkosBlas1_serial_scal_impl.hpp"
 
-namespace KokkosBatched {
-
-///
-/// Serial Internal Impl
-/// ====================
-struct SerialScaleInternal {
-  template <typename ScalarType, typename ValueType>
-  KOKKOS_INLINE_FUNCTION static int invoke(const int m, const ScalarType alpha,
-                                           /* */ ValueType *KOKKOS_RESTRICT A,
-                                           const int as0) {
-#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
-#pragma unroll
-#endif
-    for (int i = 0; i < m; ++i) A[i * as0] *= alpha;
-
-    return 0;
-  }
-
-  template <typename ScalarType, typename ValueType>
-  KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n,
-                                           const ScalarType alpha,
-                                           /* */ ValueType *KOKKOS_RESTRICT A,
-                                           const int as0, const int as1) {
-    if (as0 > as1)
-      for (int i = 0; i < m; ++i) invoke(n, alpha, A + i * as0, as1);
-    else
-      for (int j = 0; j < n; ++j) invoke(m, alpha, A + j * as1, as0);
-
-    return 0;
-  }
-};
+namespace KokkosBlas {
+namespace Impl {
 
 ///
 /// Team Internal Impl
@@ -115,6 +129,7 @@ struct TeamVectorScaleInternal {
   }
 };
 
-}  // namespace KokkosBatched
+}  // namespace Impl
+}  // namespace KokkosBlas
 
 #endif
diff --git a/src/blas/impl/KokkosBlas2_gemv_impl.hpp b/src/blas/impl/KokkosBlas2_gemv_impl.hpp
index 6f27363be9..a6c8111684 100644
--- a/src/blas/impl/KokkosBlas2_gemv_impl.hpp
+++ b/src/blas/impl/KokkosBlas2_gemv_impl.hpp
@@ -64,8 +64,9 @@ struct SingleLevelNontransposeGEMV {
   using BetaCoeffType  = typename YViewType::non_const_value_type;
   using y_value_type   = typename YViewType::non_const_value_type;
   using AccumScalar    = typename std::conditional<
-      std::is_same<y_value_type, Kokkos::Experimental::half_t>::value, float,
-      y_value_type>::type;
+      std::is_same<y_value_type, Kokkos::Experimental::half_t>::value ||
+          std::is_same<y_value_type, Kokkos::Experimental::bhalf_t>::value,
+      float, y_value_type>::type;
 
   SingleLevelNontransposeGEMV(const AlphaCoeffType& alpha, const AViewType& A,
                               const XViewType& x, const BetaCoeffType& beta,
@@ -146,8 +147,9 @@ struct SingleLevelTransposeGEMV {
   using AlphaCoeffType = typename AViewType::non_const_value_type;
   using BetaCoeffType  = typename YViewType::non_const_value_type;
   using AccumScalar    = typename std::conditional<
-      std::is_same<y_value_type, Kokkos::Experimental::half_t>::value, float,
-      y_value_type>::type;
+      std::is_same<y_value_type, Kokkos::Experimental::half_t>::value ||
+          std::is_same<y_value_type, Kokkos::Experimental::bhalf_t>::value,
+      float, y_value_type>::type;
 
   typedef AccumScalar value_type[];
   IndexType value_count;  // Kokkos needs this for reductions w/ array results
@@ -188,8 +190,7 @@ struct SingleLevelTransposeGEMV {
     }
   }
 
-  KOKKOS_INLINE_FUNCTION void join(volatile value_type dst,
-                                   const volatile value_type src) const {
+  KOKKOS_INLINE_FUNCTION void join(value_type dst, const value_type src) const {
     for (IndexType j = 0; j < value_count; ++j) {
       dst[j] += src[j];
     }
@@ -479,8 +480,9 @@ struct TwoLevelGEMV {
   using AlphaCoeffType = typename AViewType::non_const_value_type;
   using BetaCoeffType  = typename YViewType::non_const_value_type;
   using AccumScalar    = typename std::conditional<
-      std::is_same<y_value_type, Kokkos::Experimental::half_t>::value, float,
-      y_value_type>::type;
+      std::is_same<y_value_type, Kokkos::Experimental::half_t>::value ||
+          std::is_same<y_value_type, Kokkos::Experimental::bhalf_t>::value,
+      float, y_value_type>::type;
 
   using execution_space = typename AViewType::execution_space;
   using policy_type     = Kokkos::TeamPolicy<execution_space>;
@@ -600,8 +602,9 @@ struct TwoLevelTransposeGEMV {
   using AlphaCoeffType = typename AViewType::non_const_value_type;
   using BetaCoeffType  = typename YViewType::non_const_value_type;
   using AccumScalar    = typename std::conditional<
-      std::is_same<y_value_type, Kokkos::Experimental::half_t>::value, float,
-      y_value_type>::type;
+      std::is_same<y_value_type, Kokkos::Experimental::half_t>::value ||
+          std::is_same<y_value_type, Kokkos::Experimental::bhalf_t>::value,
+      float, y_value_type>::type;
 
   using execution_space = typename AViewType::execution_space;
   using policy_type     = Kokkos::TeamPolicy<execution_space>;
@@ -739,7 +742,8 @@ void twoLevelGemv(const typename AViewType::execution_space& space,
     tagged_policy team;
     if (isLayoutLeft) {
       using AccumScalar = typename std::conditional<
-          std::is_same<y_value_type, Kokkos::Experimental::half_t>::value,
+          std::is_same<y_value_type, Kokkos::Experimental::half_t>::value ||
+              std::is_same<y_value_type, Kokkos::Experimental::bhalf_t>::value,
           float, y_value_type>::type;
       size_t sharedPerTeam = 32 * sizeof(AccumScalar);
       IndexType numTeams   = (A.extent(0) + 31) / 32;
diff --git a/src/blas/impl/KokkosBlas3_trmm_impl.hpp b/src/blas/impl/KokkosBlas3_trmm_impl.hpp
index 56bc2ba806..2ba3363264 100644
--- a/src/blas/impl/KokkosBlas3_trmm_impl.hpp
+++ b/src/blas/impl/KokkosBlas3_trmm_impl.hpp
@@ -53,8 +53,6 @@
 #include "KokkosKernels_config.h"
 #include "Kokkos_Core.hpp"
 #include "Kokkos_ArithTraits.hpp"
-#include "KokkosBatched_Set_Internal.hpp"
-#include "KokkosBatched_Scale_Internal.hpp"
 #include "KokkosBatched_Trmm_Decl.hpp"
 #include "KokkosBatched_Trmm_Serial_Impl.hpp"
 
diff --git a/src/blas/impl/KokkosBlas3_trsm_impl.hpp b/src/blas/impl/KokkosBlas3_trsm_impl.hpp
index b215633093..d85b850138 100644
--- a/src/blas/impl/KokkosBlas3_trsm_impl.hpp
+++ b/src/blas/impl/KokkosBlas3_trsm_impl.hpp
@@ -54,6 +54,7 @@
 #include "KokkosKernels_config.h"
 #include "Kokkos_Core.hpp"
 #include "Kokkos_ArithTraits.hpp"
+#include "KokkosBlas1_set_impl.hpp"
 #include "KokkosBatched_Trsm_Decl.hpp"
 #include "KokkosBatched_Trsm_Serial_Impl.hpp"
 
@@ -72,10 +73,10 @@ int SerialTrsmInternalLeftLowerConj(const bool use_unit_diag, const int m,
   const ScalarType one(1.0), zero(0.0);
 
   if (alpha == zero)
-    KokkosBatched::SerialSetInternal ::invoke(m, n, zero, B, bs0, bs1);
+    SerialSetInternal::invoke(m, n, zero, B, bs0, bs1);
   else {
     if (alpha != one)
-      KokkosBatched::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1);
+      KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1);
     if (m <= 0 || n <= 0) return 0;
 
     for (int p = 0; p < m; ++p) {
@@ -111,10 +112,10 @@ int SerialTrsmInternalLeftUpperConj(const bool use_unit_diag, const int m,
   const ScalarType one(1.0), zero(0.0);
 
   if (alpha == zero)
-    KokkosBatched::SerialSetInternal ::invoke(m, n, zero, B, bs0, bs1);
+    SerialSetInternal::invoke(m, n, zero, B, bs0, bs1);
   else {
     if (alpha != one)
-      KokkosBatched::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1);
+      KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1);
     if (m <= 0 || n <= 0) return 0;
 
     ValueType* KOKKOS_RESTRICT B0 = B;
diff --git a/src/blas/impl/KokkosBlas_Newton_impl.hpp b/src/blas/impl/KokkosBlas_Newton_impl.hpp
new file mode 100644
index 0000000000..a8a8973d41
--- /dev/null
+++ b/src/blas/impl/KokkosBlas_Newton_impl.hpp
@@ -0,0 +1,240 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Luc Berger-Vergiat (lberge@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef __KOKKOSBATCHED_ODE_NEWTON_HPP__
+#define __KOKKOSBATCHED_ODE_NEWTON_HPP__
+
+#include "Kokkos_Core.hpp"
+#include "KokkosBatched_LU_Decl.hpp"
+#include "KokkosBatched_LU_Serial_Impl.hpp"
+#include "KokkosBatched_Gesv.hpp"
+#include "KokkosBlas1_nrm2.hpp"
+#include "KokkosBlas1_scal.hpp"
+#include "KokkosBlas1_axpby.hpp"
+
+namespace KokkosBlas {
+namespace Impl {
+
+enum class NewtonSolverStatus { Converged = 0, LinearSolveFailure, MaxIters };
+
+std::ostream& operator<<(std::ostream& os, NewtonSolverStatus& status) {
+  switch (status) {
+    case NewtonSolverStatus::Converged: os << "Newton Solver Converged!"; break;
+    case NewtonSolverStatus::LinearSolveFailure:
+      os << "Newton: Linear Solver Failure";
+      break;
+    case NewtonSolverStatus::MaxIters:
+      os << "Newton reached maximum iterations without convergence.";
+      break;
+  }
+  return os;
+}
+
+/// \brief NewtonHandle
+///
+/// This handle is used to pass information between the Newton Solver and
+/// the calling code.
+///
+/// \tparam: NormViewType: Type of view used to store the residual convergence
+/// history
+
+template <class NormViewType>
+struct NewtonHandle {
+  using norm_type = typename NormViewType::non_const_value_type;
+
+  NormViewType lastResidual;  // Residual of last successful iteration
+  typename NormViewType::HostMirror lastResidualHost;
+
+  // NormViewType  residual_norms;
+  // TODO: Making these public for now. Should make private and access
+  // via setters and getters?
+  int maxIters;           // Maximum number of Newton steps
+  norm_type relativeTol;  // Relative convergence tolerance
+  bool debug_mode;        // Returns extra verbose output if true.
+
+  NewtonHandle(int _maxIters = 25, double _relativeTol = 1.0e-6,
+               bool _debug = false)
+      : lastResidual("ending Residual norm", 1),
+        lastResidualHost("end res norm host", 1),
+        maxIters(_maxIters),
+        relativeTol(_relativeTol),
+        debug_mode(_debug) {}
+
+  KOKKOS_FUNCTION
+  void set_residual(const norm_type val) const { lastResidual(0) = val; }
+
+  KOKKOS_FUNCTION
+  norm_type get_residual() const { return lastResidual(0); }
+
+  norm_type get_residual_host() const {
+    Kokkos::deep_copy(lastResidualHost, lastResidual);
+    return lastResidualHost(0);
+  }
+
+};  // NewtonHandle
+
+/// \brief Newton Functor:
+/// Solves the nonlinear system F(x) = 0
+/// where F is a map from R^n to R^n.
+/// \tparam System: Struct that allows the evaluation
+///         of the residual and jacobian using the
+///         residual() and jacobian() methods.
+/// \tparam Matrix: rank-2 view-type
+/// \tparam XVector: rank-1 view-type
+/// \tparam YVector: rank-1 view-type
+/// \param
+/// \param X [in]: Input vector X, a rank 1 view
+/// \param Y [in/out]: Output vector Y, a rank 1 view
+///
+/// No nested parallel_for is used inside of the function.
+///
+template <class System, class Matrix, class XVector, class YVector,
+          class NewtonHandleType>
+struct NewtonFunctor {
+  using execution_space = typename YVector::execution_space;
+  using yvalue_type     = typename YVector::non_const_value_type;
+  using norm_type       = typename NewtonHandleType::norm_type;
+
+  System sys;
+  XVector x;
+  YVector rhs;
+  NewtonHandleType handle;
+
+  Matrix J, tmp;
+  XVector update;
+
+  NewtonFunctor(System _sys, XVector _x, YVector _rhs,
+                NewtonHandleType& _handle)
+      : sys(_sys), x(_x), rhs(_rhs), handle(_handle) {
+    J      = Matrix("Jacobian", x.extent(0), x.extent(0));
+    tmp    = Matrix("Jacobian", x.extent(0), x.extent(0) + 4);
+    update = XVector("update", x.extent(0));
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  NewtonSolverStatus solve() const {
+    norm_type norm    = Kokkos::ArithTraits<norm_type>::zero();
+    yvalue_type alpha = Kokkos::ArithTraits<yvalue_type>::one();
+    handle.set_residual(-1);  // init to dummy value
+
+    // Iterate until maxIts or the tolerance is reached
+    for (int it = 0; it < handle.maxIters; ++it) {
+      // compute initial rhs
+      sys.residual(x, rhs);
+      if (handle.debug_mode) {
+        KOKKOS_IMPL_DO_NOT_USE_PRINTF("NewtonFunctor: r=");
+        for (int k = 0; k < rhs.extent_int(0); k++) {
+          KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f \n", rhs(k));
+        }
+      }
+
+      // Solve the following linearized
+      // problem at each step: J*update=-rhs
+      // with J=du/dx, rhs=f(u_n+update)-f(u_n)
+      norm = KokkosBlas::serial_nrm2(rhs);
+      handle.set_residual(norm);
+
+      if (handle.debug_mode) {
+        KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+            "NewtonFunctor: Iteration: %d  Current res norm is: %e \n Current "
+            "soln is:\n",
+            it, (double)handle.get_residual());
+        for (int k = 0; k < x.extent_int(0); k++) {
+          KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f \n", x(k));
+        }
+      }
+
+      if (norm < handle.relativeTol) {
+        // Problem solved, exit the functor
+        if (handle.debug_mode) {
+          KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+              "NewtonFunctor: Newton solver converged! Ending norm is: %e \n "
+              "Solution x is: "
+              "\n",
+              norm);
+          for (int k = 0; k < x.extent_int(0); k++) {
+            KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f \n", x(k));
+          }
+        }
+        return NewtonSolverStatus::Converged;
+      }
+
+      // compute LHS
+      sys.jacobian(x, J);
+
+      // solve linear problem
+      int linSolverStat = KokkosBatched::SerialGesv<
+          KokkosBatched::Gesv::StaticPivoting>::invoke(J, update, rhs, tmp);
+      KokkosBlas::SerialScale::invoke(-1, update);
+
+      if (handle.debug_mode) {
+        KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+            "NewtonFunctor: Print linear solve solution: \n");
+        for (int k = 0; k < update.extent_int(0); k++) {
+          KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f \n", update(k));
+        }
+      }
+      if (linSolverStat == 1) {
+        KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+            "NewtonFunctor: Linear solve gesv returned failure! \n");
+        return NewtonSolverStatus::LinearSolveFailure;
+      }
+
+      // update solution // x = x + alpha*update
+      KokkosBlas::serial_axpy(alpha, update, x);
+      if (handle.debug_mode) {
+        KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+            "NewtonFunctor: Print updated solution: \n");
+        for (int k = 0; k < x.extent_int(0); k++) {
+          KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f \n", x(k));
+        }
+      }
+    }
+    return NewtonSolverStatus::MaxIters;
+  }  // End solve functor.
+};
+
+}  // namespace Impl
+}  // namespace KokkosBlas
+#endif  // __KOKKOSBATCHED_ODE_NEWTON_HPP__
diff --git a/src/blas/impl/KokkosBlas_serial_axpy.hpp b/src/blas/impl/KokkosBlas_serial_axpy.hpp
new file mode 100644
index 0000000000..f9cc918650
--- /dev/null
+++ b/src/blas/impl/KokkosBlas_serial_axpy.hpp
@@ -0,0 +1,88 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Luc Berger-Vergiat (lberge@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOSBLAS_SERIAL_AXPY_IMPL_HPP_
+#define KOKKOSBLAS_SERIAL_AXPY_IMPL_HPP_
+
+#include <Kokkos_Core.hpp>
+
+namespace KokkosBlas {
+namespace Impl {
+
+///
+/// Serial Internal Impl
+/// ====================
+template <typename ScalarType, typename ValueType>
+KOKKOS_INLINE_FUNCTION static void serial_axpy(
+    const int m, const ScalarType alpha, const ValueType *KOKKOS_RESTRICT X,
+    /* */ ValueType *KOKKOS_RESTRICT Y, const int xs0, const int ys0) {
+#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
+#pragma unroll
+#endif
+  for (int i = 0; i < m; ++i) Y[i * ys0] += alpha * X[i * xs0];
+
+  return;
+}
+
+template <typename ScalarType, typename ValueType>
+KOKKOS_INLINE_FUNCTION static void serial_axpy_mv(
+    const int m, const int n, const ScalarType alpha,
+    const ValueType *KOKKOS_RESTRICT X,
+    /* */ ValueType *KOKKOS_RESTRICT Y, const int xs0, const int xs1,
+    const int ys0, const int ys1) {
+  if (xs0 > xs1) {
+    for (int i = 0; i < m; ++i)
+      serial_axpy(n, alpha, X + i * xs0, Y + i * ys0, xs1, ys1);
+  } else {
+    for (int j = 0; j < n; ++j)
+      serial_axpy(m, alpha, X + j * xs1, Y + j * ys1, xs0, ys0);
+  }
+
+  return;
+}
+
+}  // namespace Impl
+}  // namespace KokkosBlas
+
+#endif
diff --git a/src/blas/impl/KokkosBlas_serial_nrm2.hpp b/src/blas/impl/KokkosBlas_serial_nrm2.hpp
new file mode 100644
index 0000000000..9397dc5020
--- /dev/null
+++ b/src/blas/impl/KokkosBlas_serial_nrm2.hpp
@@ -0,0 +1,92 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOSBLAS_SERIAL_NRM2_HPP_
+#define KOKKOSBLAS_SERIAL_NRM2_HPP_
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_InnerProductSpaceTraits.hpp>
+
+namespace KokkosBlas {
+namespace Impl {
+
+///
+/// Serial Internal Impl
+/// ====================
+template <typename ValueType>
+KOKKOS_INLINE_FUNCTION static
+    typename Kokkos::Details::InnerProductSpaceTraits<ValueType>::mag_type
+    serial_nrm2(const int m, const ValueType *KOKKOS_RESTRICT X,
+                const int xs0) {
+  using IPT       = Kokkos::Details::InnerProductSpaceTraits<ValueType>;
+  using norm_type = typename IPT::mag_type;
+
+  norm_type nrm = Kokkos::ArithTraits<norm_type>::zero();
+
+#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
+#pragma unroll
+#endif
+  for (int i = 0; i < m; ++i)
+    nrm += IPT::norm(IPT::dot(X[i * xs0], X[i * xs0]));
+
+  return Kokkos::ArithTraits<norm_type>::sqrt(nrm);
+}
+
+template <typename ValueType>
+KOKKOS_INLINE_FUNCTION static void serial_nrm2(
+    const int m, const int n, const ValueType *KOKKOS_RESTRICT X, const int xs0,
+    const int xs1,
+    typename Kokkos::Details::InnerProductSpaceTraits<ValueType>::mag_type
+        *KOKKOS_RESTRICT R,
+    const int ys0) {
+  for (int vecIdx = 0; vecIdx < n; ++vecIdx)
+    R[vecIdx * ys0] = serial_nrm2(m, X + vecIdx * xs1, xs0);
+
+  return;
+}
+
+}  // namespace Impl
+}  // namespace KokkosBlas
+
+#endif  // KOKKOSBLAS_SERIAL_NRM2_HPP_
diff --git a/src/blas/impl/KokkosBlas_trtri_spec.hpp b/src/blas/impl/KokkosBlas_trtri_spec.hpp
index 1cccad1ea4..0bbeb294dc 100644
--- a/src/blas/impl/KokkosBlas_trtri_spec.hpp
+++ b/src/blas/impl/KokkosBlas_trtri_spec.hpp
@@ -69,7 +69,7 @@ struct trtri_eti_spec_avail {
                                         MEM_SPACE)                           \
   template <>                                                                \
   struct trtri_eti_spec_avail<                                               \
-      Kokkos::View<int, LAYOUTA, Kokkos::HostSpace,                          \
+      Kokkos::View<int, Kokkos::LayoutRight, Kokkos::HostSpace,              \
                    Kokkos::MemoryTraits<Kokkos::Unmanaged> >,                \
       Kokkos::View<SCALAR**, LAYOUTA, Kokkos::Device<EXEC_SPACE, MEM_SPACE>, \
                    Kokkos::MemoryTraits<Kokkos::Unmanaged> > > {             \
@@ -136,7 +136,7 @@ struct TRTRI<RVIT, AVIT, false, KOKKOSKERNELS_IMPL_COMPILE_LIBRARY> {
 //
 #define KOKKOSBLAS_TRTRI_ETI_SPEC_DECL(SCALAR, LAYOUTA, EXEC_SPACE, MEM_SPACE) \
   extern template struct TRTRI<                                                \
-      Kokkos::View<int, LAYOUTA, Kokkos::HostSpace,                            \
+      Kokkos::View<int, Kokkos::LayoutRight, Kokkos::HostSpace,                \
                    Kokkos::MemoryTraits<Kokkos::Unmanaged> >,                  \
       Kokkos::View<SCALAR**, LAYOUTA, Kokkos::Device<EXEC_SPACE, MEM_SPACE>,   \
                    Kokkos::MemoryTraits<Kokkos::Unmanaged> >,                  \
@@ -144,7 +144,7 @@ struct TRTRI<RVIT, AVIT, false, KOKKOSKERNELS_IMPL_COMPILE_LIBRARY> {
 
 #define KOKKOSBLAS_TRTRI_ETI_SPEC_INST(SCALAR, LAYOUTA, EXEC_SPACE, MEM_SPACE) \
   template struct TRTRI<                                                       \
-      Kokkos::View<int, LAYOUTA, Kokkos::HostSpace,                            \
+      Kokkos::View<int, Kokkos::LayoutRight, Kokkos::HostSpace,                \
                    Kokkos::MemoryTraits<Kokkos::Unmanaged> >,                  \
       Kokkos::View<SCALAR**, LAYOUTA, Kokkos::Device<EXEC_SPACE, MEM_SPACE>,   \
                    Kokkos::MemoryTraits<Kokkos::Unmanaged> >,                  \
diff --git a/src/kokkoskernels_eti.cmake b/src/cmake/kokkoskernels_eti.cmake
similarity index 100%
rename from src/kokkoskernels_eti.cmake
rename to src/cmake/kokkoskernels_eti.cmake
diff --git a/src/common/KokkosKernels_BlockHashmapAccumulator.hpp b/src/common/KokkosKernels_BlockHashmapAccumulator.hpp
new file mode 100644
index 0000000000..576060cf75
--- /dev/null
+++ b/src/common/KokkosKernels_BlockHashmapAccumulator.hpp
@@ -0,0 +1,660 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+#ifndef _KOKKOSKERNELS_BLOCKHASHMAPACCUMULATOR_HPP
+#define _KOKKOSKERNELS_BLOCKHASHMAPACCUMULATOR_HPP
+#include <Kokkos_Atomic.hpp>
+#include <atomic>
+#include "KokkosKernels_BlockUtils.hpp"
+#include "KokkosKernels_HashmapAccumulator.hpp"
+
+//#define HASHMAPACCUMULATOR_ASSERT_ENABLED
+
+namespace KokkosKernels {
+
+namespace Experimental {
+
+template <typename size_type, typename key_type, typename value_type,
+          typename hash_type>
+/**
+ * \brief BlockHashmapAccumulator class
+ * The use of this is described in the paper:
+ *   "Performance-portable sparse matrix-matrix multiplication for many-core
+ * architectures" ( https://ieeexplore.ieee.org/abstract/document/7965111/ ) in
+ * section III.D
+ *
+ * Public members:
+ * \var hash_begins: Holds the beginning indices of the linked lists
+ *                   corresponding to hash values [Begins]
+ * \var hash_nexts:  Holds the indicies of the next elements
+ *                   within the linked list [Nexts]
+ * \var keys:        This stores the column indices of the crs matrix [Ids]
+ * \var values:      This store the numerical values (matrix elements) [Values]
+ *
+ * Private members:
+ * \var __max_value_size: The length of the two arrays (keys and hash_nexts)
+ * \var __hashOpRHS:      The right hand side of the requested hash operation.
+ * \var __insert_success: Value to return upon insertion success.
+ * \var __insert_full:    Value to return upon insertion failure.
+ */
+struct BlockHashmapAccumulator {
+  // begin public members
+  // issue-508, TODO: It's best for used_size to be an internal member of this
+  // class but the current use-cases rely on used_size to be a parameter to the
+  // below insertion routines. One way to remove used_size as a parameter to the
+  // insertion routines is to instantiate multiple BlockHashmapAccumulator
+  // objects (one hashmap for each team of threads) instead of using a single
+  // BlockHashmapAccumulator object for multiple teams of threads; this entails
+  // major refactoring throughout the kokkos-kernels code base.
+  // Making used_size a pointer and private member of this
+  // class still exposes access to this member outside of the class and is
+  // not a good option.
+  // size_type used_size;
+
+  // issue-508, TODO: The hash_begins, hash_nexts, keys, values,
+  // __insert_success, and __insert_full members should all be private as well.
+  // They should be managed solely by this BlockHashmapAccumulator class:
+  // initialized in the constructor(s) and only managed by
+  // BlockHashmapAccumulator insertion routines. Making these members private
+  // requires major refactoring throughout the kokkos-kernels code base. If
+  // allocations for these members must really live outside this class, we need
+  // new members that break
+  // __max_value_size into: hash_begins_len, hash_nexts_len, keys_len, and
+  // values_len...!
+
+  size_type *hash_begins;
+  size_type *hash_nexts;
+  key_type *keys;
+  value_type *values;
+  const size_type block_dim;
+  const size_type block_size;
+
+  /**
+   * \brief default constructor BlockHashmapAccumulator
+   * Sets used_size to 0, __insert_success to 0, __insert_full to 1, and
+   * __hashOpRHS to 0.
+   *
+   * Assumption: hash_begins_ are all initialized to -1.
+   */
+  KOKKOS_INLINE_FUNCTION
+  BlockHashmapAccumulator()
+      : hash_begins(),
+        hash_nexts(),
+        keys(),
+        values(),
+        __max_value_size(),
+        __hashOpRHS(0) {}
+
+  /**
+   * \brief parameterized constructor BlockHashmapAccumulator
+   * Sets used_size to 0, __insert_success to 0, and __insert_full to 1.
+   *
+   * \param max_value_size_: The length of the two arrays (keys and hash_nexts)
+   * \param hashOpRHS:       The right hand side of the requested hash
+   * operation. \param hash_begins_:    Holds the beginning indices of the
+   * linked lists corresponding to hash values [Begins] \param hash_nexts_:
+   * Holds the indicies of the next elements within the linked list [Nexts]
+   * \param keys_:           This stores the column indices of (??) [Ids]
+   * \param values_:         This store the (matrix element?) numerical value of
+   * (??) [Values]
+   *
+   * Assumption: hash_begins_ are all initialized to -1.
+   */
+  KOKKOS_INLINE_FUNCTION
+  BlockHashmapAccumulator(size_type block_dim_, const size_type max_value_size_,
+                          const size_type hashOpRHS, size_type *hash_begins_,
+                          size_type *hash_nexts_, key_type *keys_,
+                          value_type *values_)
+      : hash_begins(hash_begins_),
+        hash_nexts(hash_nexts_),
+        keys(keys_),
+        values(values_),
+        block_dim(block_dim_),
+        block_size(block_dim_ * block_dim_),
+        __max_value_size(max_value_size_),
+        __hashOpRHS(hashOpRHS) {
+    // Substract 1 and use the bitwiseAnd __compute_hash member.
+    if (std::is_same<hash_type, HashOpType::pow2Modulo>::value) {
+      __hashOpRHS -= 1;
+    }
+  }
+
+  // Performs C[hash] += A * B (for existing entry)
+  //       or C[hash]  = A * B (for new entry)
+  // Insertion is sequential, no race condition for the insertion.
+  // the mergeadd used in the numeric of KKMEM.
+  KOKKOS_INLINE_FUNCTION
+  void sequential_insert_into_hash_mergeAdd_TrackHashes(
+      key_type key, const value_type *valueA, const value_type *valueB,
+      size_type *used_size_, size_type *used_hash_size,
+      size_type *used_hashes) {
+    size_type hash, i, my_index;
+
+    if (key == -1) return;
+
+    // issue-508, TODO: ensure that i < __max_value_size, but
+    // need information about length of keys, values, and hash_nexts first!
+    hash = __compute_hash(key, __hashOpRHS);
+    for (i = hash_begins[hash]; i != -1; i = hash_nexts[i]) {
+      if (keys[i] == key) {
+        KokkosSparse::Impl::kk_block_add_mul(block_dim, values + i * block_size,
+                                             valueA, valueB);
+        return;
+      }
+    }
+
+    my_index = (*used_size_)++;
+
+    if (hash_begins[hash] == -1) {
+      used_hashes[used_hash_size[0]++] = hash;
+    }
+    hash_nexts[my_index] = hash_begins[hash];
+
+    hash_begins[hash] = my_index;
+    keys[my_index]    = key;
+    KokkosSparse::Impl::kk_block_set_mul(
+        block_dim, values + my_index * block_size, valueA, valueB);
+  }
+
+  // Performs C[hash] += A * B (for existing entry)
+  //       or C[hash]  = A * B (for new entry)
+  // Insertion is sequential, no race condition for the insertion.
+  // the mergeadd used in the numeric of KKMEM.
+  KOKKOS_INLINE_FUNCTION
+  void sequential_insert_into_hash_simple(key_type key, const value_type *a_val,
+                                          const value_type *b_val,
+                                          size_type &used_size,
+                                          size_type *used_hashes) {
+    for (size_type hash = (key * HASHSCALAR) & __hashOpRHS;;
+         hash           = (hash + 1) & __hashOpRHS) {
+      if (keys[hash] == -1) {
+        used_hashes[used_size++] = hash;
+        keys[hash]               = key;
+        KokkosSparse::Impl::kk_block_set_mul(
+            block_dim, values + hash * block_size, a_val, b_val);
+        break;
+      } else if (keys[hash] == key) {
+        KokkosSparse::Impl::kk_block_add_mul(
+            block_dim, values + hash * block_size, a_val, b_val);
+        break;
+      }
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void sequential_export_values_simple(const size_type used_size,
+                                       const size_type *used_hashes,
+                                       key_type *out_keys,
+                                       value_type *out_values,
+                                       const bool clear = true) {
+    for (size_type i = 0; i < used_size; ++i) {
+      const auto hash = used_hashes[i];
+      out_keys[i]     = keys[hash];
+      KokkosSparse::Impl::kk_block_set(block_dim, out_values + i * block_size,
+                                       values + hash * block_size);
+      if (clear) {
+        keys[hash] = -1;
+      }
+    }
+  }
+
+  // used in the kkmem's numeric phase for second level hashmaps.
+  // function to be called from device.
+  // Accumulation is Add operation. It is not atomicAdd, as this
+  // is for the cases where we know that none of the simultanous
+  // insertions will have the same key.
+  // Insertion is simulteanous for the vector lanes of a thread.
+  // used_size should be a shared pointer among the thread vectors
+  KOKKOS_INLINE_FUNCTION
+  int vector_atomic_insert_into_hash_mergeAdd_TrackHashes(
+      const key_type key, const value_type *valA, const value_type *valB,
+      volatile size_type *used_size_, size_type *used_hash_size,
+      size_type *used_hashes) {
+    size_type hash, i, my_write_index, hashbeginning;
+
+    if (key == -1) return __insert_success;
+
+    hash = __compute_hash(key, __hashOpRHS);
+    if (hash != -1) {
+      i = hash_begins[hash];
+
+      for (; i != -1; i = hash_nexts[i]) {
+        if (keys[i] == key) {
+          KokkosSparse::Impl::kk_block_add_mul(
+              block_dim, values + i * block_size, valA, valB);
+          return __insert_success;
+        }
+      }
+    } else {
+      return __insert_success;
+    }
+
+    my_write_index = Kokkos::atomic_fetch_add(used_size_, size_type(1));
+
+    if (my_write_index >= __max_value_size) {
+      return __insert_full;
+    } else {
+      keys[my_write_index] = key;
+      KokkosSparse::Impl::kk_block_set_mul(
+          block_dim, values + my_write_index * block_size, valA, valB);
+
+#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \
+    defined(KOKKOS_ARCH_AMPERE)
+      // this is an issue on VOLTA and up because warps do not go in SIMD
+      // fashion anymore. while some thread might insert my_write_index into
+      // linked list, another thread in the warp might be reading keys in above
+      // loop. before inserting the new value in liked list -- which is done
+      // with atomic exchange below, we make sure that the linked is is complete
+      // my assigning the hash_next to current head. the head might be different
+      // when we do the atomic exchange. this would cause temporarily skipping a
+      // key in the linkedlist until hash_nexts is updated second time as below.
+      // but this is okay for spgemm,
+      // because no two keys will be inserted into hashmap at the same time, as
+      // rows have unique columns.
+
+      // Neither the compiler nor the execution unit can re-order the line
+      // directly below with the next line performing the atomic_exchange as the
+      // atomic exchange writes to hash_begins[hash] and this line reads from
+      // hash_begins[hash].
+      // This line is needed such that threads of execution can still access the
+      // old linked list, after hash_begins+hash has been atomically overwritten
+      // with my_write_index but before hash_nexts[my_write_index] is
+      // overwritten with hashbeginning. If this line was not here, threads may
+      // not be able to access the dangling linked list since
+      // hash_nexts[my_write_index] would still be -1.
+      hash_nexts[my_write_index] = hash_begins[hash];
+#endif
+
+      hashbeginning =
+          Kokkos::atomic_exchange(hash_begins + hash, my_write_index);
+      if (hashbeginning == -1) {
+        used_hashes[Kokkos::atomic_fetch_add(used_hash_size, size_type(1))] =
+            hash;
+      }
+      hash_nexts[my_write_index] = hashbeginning;
+      return __insert_success;
+    }
+  }
+
+  template <typename team_member_t>
+  KOKKOS_INLINE_FUNCTION int
+  vector_atomic_insert_into_hash_mergeAdd_with_team_level_list_length(
+      const team_member_t & /* teamMember */, const int /* vector_size */,
+      size_type hash, const key_type key, const value_type *valA,
+      const value_type *valB, volatile size_type *used_size_,
+      const size_type max_value_size_) {
+    // Cannot compute hash here due to impl_speed use-case
+    // hash = __compute_hash(key, __hashOpRHS);
+    if (key == -1) return __insert_success;
+
+    if (hash != -1) {
+      size_type i = hash_begins[hash];
+      for (; i != -1; i = hash_nexts[i]) {
+        if (keys[i] == key) {
+          KokkosSparse::Impl::kk_block_add_mul(
+              block_dim, values + i * block_size, valA, valB);
+          return __insert_success;
+        }
+      }
+    } else {
+      return __insert_success;
+    }
+
+    // Ensure that threads don't continue incrementing used_size_ if the hashmap
+    // is full, used_size_ could overflow and result in undefined behavior.
+    if (used_size_[0] >= max_value_size_) {
+      return __insert_full;
+    }
+    size_type my_write_index =
+        Kokkos::atomic_fetch_add(used_size_, size_type(1));
+
+    if (my_write_index >= max_value_size_) {
+      return __insert_full;
+    } else {
+      keys[my_write_index] = key;
+      KokkosSparse::Impl::kk_block_set_mul(
+          block_dim, values + my_write_index * block_size, valA, valB);
+
+#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \
+    defined(KOKKOS_ARCH_AMPERE)
+      // this is an issue on VOLTA and up because warps do not go in SIMD
+      // fashion anymore. while some thread might insert my_write_index into
+      // linked list, another thread in the warp might be reading keys in above
+      // loop. before inserting the new value in liked list -- which is done
+      // with atomic exchange below, we make sure that the linked is is complete
+      // my assigning the hash_next to current head. the head might be different
+      // when we do the atomic exchange. this would cause temporarily skipping a
+      // key in the linkedlist until hash_nexts is updated second time as below.
+      // but this is okay for spgemm,
+      // because no two keys will be inserted into hashmap at the same time, as
+      // rows have unique columns.
+
+      // Neither the compiler nor the execution unit can re-order the line
+      // directly below with the next line performing the atomic_exchange as the
+      // atomic exchange writes to hash_begins[hash] and this line reads from
+      // hash_begins[hash].
+      // This line is needed such that threads of execution can still access the
+      // old linked list, after hash_begins+hash has been atomically overwritten
+      // with my_write_index but before hash_nexts[my_write_index] is
+      // overwritten with hashbeginning. If this line was not here, threads may
+      // not be able to access the dangling linked list since
+      // hash_nexts[my_write_index] would still be -1.
+      hash_nexts[my_write_index] = hash_begins[hash];
+#endif
+
+      // Atomically:
+      // hashbeginning = hash_begins[hash]
+      // hash_begins[hash] = my_write_index
+      // hash_nexts[my_write_index] = hash_begins[hash]
+      size_type hashbeginning =
+          Kokkos::atomic_exchange(hash_begins + hash, my_write_index);
+      hash_nexts[my_write_index] = hashbeginning;
+      return __insert_success;
+    }
+  }
+
+  // used in kkmem's numeric phase to insert to first level hashmaps.
+  // function to be called from device.
+  // Accumulation is Add operation. It is not atomicAdd, as this
+  // is for the cases where we know that none of the simultanous
+  // insertions will have the same key.
+  // Insertion is simulteanous for the vector lanes of a thread.
+  // used_size should be a shared pointer among the thread vectors
+  KOKKOS_INLINE_FUNCTION
+  int vector_atomic_insert_into_hash_mergeAdd(const key_type key,
+                                              const value_type *valA,
+                                              const value_type *valB,
+                                              volatile size_type *used_size_) {
+    if (key == -1) return __insert_success;
+
+    return vector_atomic_insert_into_hash_mergeAdd_with_team_level_list_length(
+        nullptr, 0, __compute_hash(key, __hashOpRHS), key, valA, valB,
+        used_size_, __max_value_size);
+  }
+
+#if 0
+  // used in symbolic of kkmem if the compression is not applied.
+  KOKKOS_INLINE_FUNCTION
+  int vector_atomic_insert_into_hash(const key_type &key,
+                                     volatile size_type *used_size_) {
+    size_type hash, i, my_write_index, hashbeginning;
+
+    if (key == -1) return __insert_success;
+
+    hash = __compute_hash(key, __hashOpRHS);
+    for (i = hash_begins[hash]; i != -1; i = hash_nexts[i]) {
+      if (keys[i] == key) {
+        return __insert_success;
+      }
+    }
+
+    my_write_index = Kokkos::atomic_fetch_add(used_size_, size_type(1));
+
+    if (my_write_index >= __max_value_size) {
+      return __insert_full;
+    } else {
+      keys[my_write_index] = key;
+
+#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \
+    defined(KOKKOS_ARCH_AMPERE)
+      // this is an issue on VOLTA and up because warps do not go in SIMD
+      // fashion anymore. while some thread might insert my_write_index into
+      // linked list, another thread in the warp might be reading keys in above
+      // loop. before inserting the new value in liked list -- which is done
+      // with atomic exchange below, we make sure that the linked is is complete
+      // my assigning the hash_next to current head. the head might be different
+      // when we do the atomic exchange. this would cause temporarily skipping a
+      // key in the linkedlist until hash_nexts is updated second time as below.
+      // but this is okay for spgemm,
+      // because no two keys will be inserted into hashmap at the same time, as
+      // rows have unique columns.
+      hash_nexts[my_write_index] = hash_begins[hash];
+#endif
+
+      hashbeginning =
+          Kokkos::atomic_exchange(hash_begins + hash, my_write_index);
+      hash_nexts[my_write_index] = hashbeginning;
+      return __insert_success;
+    }
+  }
+
+  // function to be called from device.
+  // Accumulation is Add operation. It is not atomicAdd, as this
+  // is for the cases where we know that none of the simultanous
+  // insertions will have the same key.
+  // Insertion is simulteanous for the vector lanes of a thread.
+  // used_size should be a shared pointer among the thread vectors
+  KOKKOS_INLINE_FUNCTION
+  int vector_atomic_insert_into_hash_mergeOr(const key_type &key,
+                                             const value_type &value,
+                                             volatile size_type *used_size_) {
+    size_type hash, i, my_write_index, hashbeginning;
+
+    if (key == -1) return __insert_success;
+
+    hash = __compute_hash(key, __hashOpRHS);
+    for (i = hash_begins[hash]; i != -1; i = hash_nexts[i]) {
+      if (keys[i] == key) {
+        values[i] = values[i] | value;
+        return __insert_success;
+      }
+    }
+
+    my_write_index = Kokkos::atomic_fetch_add(used_size_, size_type(1));
+
+    if (my_write_index >= __max_value_size) {
+      return __insert_full;
+    } else {
+      keys[my_write_index]   = key;
+      values[my_write_index] = value;
+
+#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \
+    defined(KOKKOS_ARCH_AMPERE)
+      // this is an issue on VOLTA and up because warps do not go in SIMD
+      // fashion anymore. while some thread might insert my_write_index into
+      // linked list, another thread in the warp might be reading keys in above
+      // loop. before inserting the new value in liked list -- which is done
+      // with atomic exchange below, we make sure that the linked is is complete
+      // my assigning the hash_next to current head. the head might be different
+      // when we do the atomic exchange. this would cause temporarily skipping a
+      // key in the linkedlist until hash_nexts is updated second time as below.
+      // but this is okay for spgemm,
+      // because no two keys will be inserted into hashmap at the same time, as
+      // rows have unique columns.
+      hash_nexts[my_write_index] = hash_begins[hash];
+#endif
+
+      hashbeginning =
+          Kokkos::atomic_exchange(hash_begins + hash, my_write_index);
+      hash_nexts[my_write_index] = hashbeginning;
+      return __insert_success;
+    }
+  }
+
+  // function to be called from device.
+  // Accumulation is Add operation. It is not atomicAdd, as this
+  // is for the cases where we know that none of the simultanous
+  // insertions will have the same key.
+  // Insertion is simulteanous for the vector lanes of a thread.
+  // used_size should be a shared pointer among the thread vectors
+  KOKKOS_INLINE_FUNCTION
+  int vector_atomic_insert_into_hash_mergeOr_TrackHashes(
+      const key_type &key, const value_type &value,
+      volatile size_type *used_size_, size_type *used_hash_size,
+      size_type *used_hashes) {
+    size_type hash, i, my_write_index, hashbeginning;
+
+    if (key == -1) return __insert_success;
+
+    hash = __compute_hash(key, __hashOpRHS);
+    for (i = hash_begins[hash]; i != -1; i = hash_nexts[i]) {
+      if (keys[i] == key) {
+        values[i] = values[i] | value;
+        return __insert_success;
+      }
+    }
+
+    my_write_index = Kokkos::atomic_fetch_add(used_size_, size_type(1));
+
+    if (my_write_index >= __max_value_size) {
+      return __insert_full;
+    } else {
+      keys[my_write_index]   = key;
+      values[my_write_index] = value;
+
+#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \
+    defined(KOKKOS_ARCH_AMPERE)
+      // this is an issue on VOLTA and up because warps do not go in SIMD
+      // fashion anymore. while some thread might insert my_write_index into
+      // linked list, another thread in the warp might be reading keys in above
+      // loop. before inserting the new value in liked list -- which is done
+      // with atomic exchange below, we make sure that the linked is is complete
+      // my assigning the hash_next to current head. the head might be different
+      // when we do the atomic exchange. this would cause temporarily skipping a
+      // key in the linkedlist until hash_nexts is updated second time as below.
+      // but this is okay for spgemm,
+      // because no two keys will be inserted into hashmap at the same time, as
+      // rows have unique columns.
+      hash_nexts[my_write_index] = hash_begins[hash];
+#endif
+
+      hashbeginning =
+          Kokkos::atomic_exchange(hash_begins + hash, my_write_index);
+      if (hashbeginning == -1) {
+        used_hashes[Kokkos::atomic_fetch_add(used_hash_size, size_type(1))] =
+            hash;
+      }
+      hash_nexts[my_write_index] = hashbeginning;
+      return __insert_success;
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  int vector_atomic_insert_into_hash_TrackHashes(const key_type &key,
+                                                 volatile size_type *used_size_,
+                                                 size_type *used_hash_size,
+                                                 size_type *used_hashes) {
+    size_type hash, i, my_write_index, hashbeginning;
+
+    if (key == -1) return __insert_success;
+
+    hash = __compute_hash(key, __hashOpRHS);
+    for (i = hash_begins[hash]; i != -1; i = hash_nexts[i]) {
+      if (keys[i] == key) {
+        return __insert_success;
+      }
+    }
+
+    my_write_index = Kokkos::atomic_fetch_add(used_size_, size_type(1));
+
+    if (my_write_index >= __max_value_size) {
+      return __insert_full;
+    } else {
+      keys[my_write_index] = key;
+
+#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \
+    defined(KOKKOS_ARCH_AMPERE)
+      // this is an issue on VOLTA and up because warps do not go in SIMD
+      // fashion anymore. while some thread might insert my_write_index into
+      // linked list, another thread in the warp might be reading keys in above
+      // loop. before inserting the new value in liked list -- which is done
+      // with atomic exchange below, we make sure that the linked is is complete
+      // my assigning the hash_next to current head. the head might be different
+      // when we do the atomic exchange. this would cause temporarily skipping a
+      // key in the linkedlist until hash_nexts is updated second time as below.
+      // but this is okay for spgemm,
+      // because no two keys will be inserted into hashmap at the same time, as
+      // rows have unique columns.
+      hash_nexts[my_write_index] = hash_begins[hash];
+#endif
+
+      hashbeginning =
+          Kokkos::atomic_exchange(hash_begins + hash, my_write_index);
+      if (hashbeginning == -1) {
+        used_hashes[Kokkos::atomic_fetch_add(used_hash_size, size_type(1))] =
+            hash;
+      }
+      hash_nexts[my_write_index] = hashbeginning;
+      return __insert_success;
+    }
+  }
+#endif
+  // end public members
+ private:
+  size_type __max_value_size;
+  size_type __hashOpRHS;
+  static constexpr int __insert_success = 0;
+  static constexpr int __insert_full    = 1;
+
+  template <typename U = hash_type,
+            typename std::enable_if<
+                std::is_same<U, HashOpType::bitwiseAnd>::value ||
+                    std::is_same<U, HashOpType::pow2Modulo>::value,
+                std::size_t>::type = 0>
+  KOKKOS_INLINE_FUNCTION int __compute_hash(size_type key, size_type bitmask) {
+    size_type hash = key & bitmask;
+#ifdef HASHMAPACCUMULATOR_ASSERT_ENABLED
+    if (hash == -1) Kokkos::abort("__compute_hash: hash = -1");
+    if (key == -1) Kokkos::abort("__compute_hash: key = -1");
+#endif  // HASHMAPACCUMULATOR_ASSERT_ENABLED
+    return hash;
+  }
+
+  template <typename U                                 = hash_type,
+            typename std::enable_if<std::is_same<U, HashOpType::modulo>::value,
+                                    std::size_t>::type = 0>
+  KOKKOS_INLINE_FUNCTION int __compute_hash(size_type key, size_type divisor) {
+    size_type hash = key % divisor;
+#ifdef HASHMAPACCUMULATOR_ASSERT_ENABLED
+    if (hash == -1) Kokkos::abort("__compute_hash: hash = -1");
+    if (key == -1) Kokkos::abort("__compute_hash: key = -1");
+#endif  // HASHMAPACCUMULATOR_ASSERT_ENABLED
+    return hash;
+  }
+  // private
+};  // struct BlockHashmapAccumulator
+
+}  // namespace Experimental
+}  // namespace KokkosKernels
+
+#endif  //  _KOKKOSKERNELS_HASHMAPACCUMULATOR_HPP
diff --git a/src/common/KokkosKernels_BlockUtils.hpp b/src/common/KokkosKernels_BlockUtils.hpp
new file mode 100644
index 0000000000..0c001ce115
--- /dev/null
+++ b/src/common/KokkosKernels_BlockUtils.hpp
@@ -0,0 +1,145 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+#ifndef _KOKKOSKERNELS_BLOCKUTILS_HPP
+#define _KOKKOSKERNELS_BLOCKUTILS_HPP
+
+// #include <Kokkos_Atomic.hpp>
+// #include <atomic>
+#include "KokkosBatched_Gemm_Serial_Internal.hpp"
+
+namespace KokkosSparse {
+namespace Impl {
+
+// Initializes block: A = [val, val, val, ....]
+template <typename size_type, typename value_type>
+KOKKOS_INLINE_FUNCTION void kk_block_init(
+    const size_type block_dim, value_type *dst,
+    const value_type val = static_cast<value_type>(
+        0)) {  // Note: replaces __host__ std::fill() not to be called from GPU
+  for (auto end = dst + (block_dim * block_dim); dst < end; ++dst) {
+    *dst = val;
+  }
+}
+
+// Initializes block: A = B
+template <typename size_type, typename value_type>
+KOKKOS_INLINE_FUNCTION void kk_block_set(const size_type block_dim,
+                                         value_type *dst,
+                                         const value_type *val) {
+  memcpy(dst, val, block_dim * block_dim * sizeof(value_type));
+}
+
+// Performs A += B on blocks
+template <typename size_type, typename value_type>
+KOKKOS_INLINE_FUNCTION void kk_block_add(const size_type block_dim,
+                                         value_type *dst,
+                                         const value_type *val) {
+  const auto end = dst + block_dim * block_dim;
+  while (dst < end) {
+    *(dst++) += *(val++);
+  }
+}
+
+// Performs C += A * B on blocks
+// Note: block is assumed to be row-major, dense matrix (no extra padding)
+// Note: set clear=true to set C = 0 before increment
+template <typename size_type, typename value_type,
+          typename DGEMM = KokkosBatched::SerialGemmInternal<
+              KokkosBatched::Algo::Gemm::Unblocked>>
+KOKKOS_INLINE_FUNCTION void kk_block_dgemm(const size_type block_dim,
+                                           value_type *dst,
+                                           const value_type *valA,
+                                           const value_type *valB,
+                                           const bool clear = false) {
+  const auto ZERO = static_cast<value_type>(0);
+  const auto ONE  = static_cast<value_type>(1);
+  DGEMM::invoke(block_dim, block_dim, block_dim, ONE, valA, block_dim, 1, valB,
+                block_dim, 1, clear ? ZERO : ONE, dst, block_dim, 1);
+}
+
+// dgemm: C = A * B
+template <typename size_type, typename value_type>
+KOKKOS_INLINE_FUNCTION void kk_block_set_mul(const size_type block_dim,
+                                             value_type *c_val,
+                                             const value_type *a_val,
+                                             const value_type *b_val) {
+  kk_block_dgemm(block_dim, c_val, a_val, b_val, true);
+}
+
+// dgemm: C += A * B
+template <typename size_type, typename value_type>
+KOKKOS_INLINE_FUNCTION void kk_block_add_mul(const size_type block_dim,
+                                             value_type *c_val,
+                                             const value_type *a_val,
+                                             const value_type *b_val) {
+  kk_block_dgemm(block_dim, c_val, a_val, b_val, false);
+}
+
+// Performs C += A * B (dense GEMM) on blocks
+// Note: all pointers reference dense row-major blocks (no extra padding)
+template <typename size_type, typename value_type>
+KOKKOS_INLINE_FUNCTION void kk_vector_block_add_mul(const size_type block_dim,
+                                                    value_type *dst,
+                                                    const value_type *valA,
+                                                    const value_type *valB) {
+  // NOTE: this should be replaced by batched DGEMM
+  //       once atomic increment is supported there
+  for (size_type row = 0; row < block_dim; ++row) {
+    auto const row_offset = row * block_dim;
+    for (size_type col = 0; col < block_dim; ++col) {
+      auto v  = &dst[row_offset + col];
+      auto vb = valB + col;
+      for (const value_type *va = valA + row_offset, *end = va + block_dim;
+           va < end; ++va) {
+        Kokkos::atomic_add(v, (*va) * (*vb));
+        vb += block_dim;
+      }
+    }
+  }
+}
+
+}  // namespace Impl
+}  // namespace KokkosSparse
+
+#endif  //  _KOKKOSKERNELS_BLOCKUTILS_HPP
diff --git a/src/common/KokkosKernels_Error.hpp b/src/common/KokkosKernels_Error.hpp
index b2f41fd4f6..11bd7f6953 100644
--- a/src/common/KokkosKernels_Error.hpp
+++ b/src/common/KokkosKernels_Error.hpp
@@ -54,6 +54,30 @@ inline void throw_runtime_exception(const std::string &msg) {
   throw std::runtime_error(msg);
 }
 
+#if defined(KOKKOS_ENABLE_HIP)
+inline void hip_internal_error_throw(hipError_t e, const char *name,
+                                     const char *file, const int line) {
+  std::ostringstream out;
+  out << name << " error( " << hipGetErrorName(e)
+      << "): " << hipGetErrorString(e);
+  if (file) {
+    out << " " << file << ":" << line;
+  }
+  throw_runtime_exception(out.str());
+}
+
+inline void hip_internal_safe_call(hipError_t e, const char *name,
+                                   const char *file = nullptr,
+                                   const int line   = 0) {
+  if (hipSuccess != e) {
+    hip_internal_error_throw(e, name, file, line);
+  }
+}
+
+#define KOKKOSKERNELS_IMPL_HIP_SAFE_CALL(call) \
+  hip_internal_safe_call(call, #call, __FILE__, __LINE__)
+#endif
+
 }  // namespace Impl
 }  // namespace KokkosKernels
 
diff --git a/src/common/KokkosKernels_ExecSpaceUtils.hpp b/src/common/KokkosKernels_ExecSpaceUtils.hpp
index 444d787963..41e750e93e 100644
--- a/src/common/KokkosKernels_ExecSpaceUtils.hpp
+++ b/src/common/KokkosKernels_ExecSpaceUtils.hpp
@@ -42,16 +42,17 @@
 //@HEADER
 */
 
+#ifndef _KOKKOSKERNELSUTILSEXECSPACEUTILS_HPP
+#define _KOKKOSKERNELSUTILSEXECSPACEUTILS_HPP
+
 #include "Kokkos_Core.hpp"
+#include "KokkosKernels_Error.hpp"
 
 #if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU)
 #include <level_zero/zes_api.h>
 #include <CL/sycl/backend/level_zero.hpp>
 #endif
 
-#ifndef _KOKKOSKERNELSUTILSEXECSPACEUTILS_HPP
-#define _KOKKOSKERNELSUTILSEXECSPACEUTILS_HPP
-
 namespace KokkosKernels {
 
 namespace Impl {
@@ -64,6 +65,7 @@ enum ExecSpaceType {
   Exec_HIP,
   Exec_SYCL
 };
+
 template <typename ExecutionSpace>
 KOKKOS_FORCEINLINE_FUNCTION ExecSpaceType kk_get_exec_space_type() {
   ExecSpaceType exec_space = Exec_SERIAL;
@@ -205,7 +207,7 @@ inline void kk_get_free_total_memory<Kokkos::CudaHostPinnedSpace>(
 template <>
 inline void kk_get_free_total_memory<Kokkos::Experimental::HIPSpace>(
     size_t& free_mem, size_t& total_mem) {
-  hipMemGetInfo(&free_mem, &total_mem);
+  KOKKOSKERNELS_IMPL_HIP_SAFE_CALL(hipMemGetInfo(&free_mem, &total_mem));
 }
 #endif
 
@@ -368,12 +370,12 @@ template <>
 struct SpaceInstance<Kokkos::Experimental::HIP> {
   static Kokkos::Experimental::HIP create() {
     hipStream_t stream;
-    hipStreamCreate(&stream);
+    KOKKOSKERNELS_IMPL_HIP_SAFE_CALL(hipStreamCreate(&stream));
     return Kokkos::Experimental::HIP(stream);
   }
   static void destroy(Kokkos::Experimental::HIP& space) {
     hipStream_t stream = space.hip_stream();
-    hipStreamDestroy(stream);
+    KOKKOSKERNELS_IMPL_HIP_SAFE_CALL(hipStreamDestroy(stream));
   }
   static bool overlap() {
     // TODO: does HIP have an equivalent for CUDA_LAUNCH_BLOCKING?
diff --git a/src/KokkosKernels_Half.hpp b/src/common/KokkosKernels_Half.hpp
similarity index 100%
rename from src/KokkosKernels_Half.hpp
rename to src/common/KokkosKernels_Half.hpp
diff --git a/src/common/KokkosKernels_HashmapAccumulator.hpp b/src/common/KokkosKernels_HashmapAccumulator.hpp
index b7f39f75c2..c6397fd9ea 100644
--- a/src/common/KokkosKernels_HashmapAccumulator.hpp
+++ b/src/common/KokkosKernels_HashmapAccumulator.hpp
@@ -344,12 +344,12 @@ struct HashmapAccumulator {
   // Insertion is sequential, no race condition for the insertion.
   // the mergeadd used in the numeric of KKMEM.
   KOKKOS_INLINE_FUNCTION
-  int sequential_insert_into_hash_mergeAdd_TrackHashes(
+  void sequential_insert_into_hash_mergeAdd_TrackHashes(
       key_type key, value_type value, size_type *used_size_,
       size_type *used_hash_size, size_type *used_hashes) {
     size_type hash, i, my_index;
 
-    if (key == -1) return __insert_success;
+    if (key == -1) return;
 
     // issue-508, TODO: ensure that i < __max_value_size, but
     // need information about length of keys, values, and hash_nexts first!
@@ -357,7 +357,7 @@ struct HashmapAccumulator {
     for (i = hash_begins[hash]; i != -1; i = hash_nexts[i]) {
       if (keys[i] == key) {
         values[i] = values[i] + value;
-        return __insert_success;
+        return;
       }
     }
 
@@ -371,7 +371,6 @@ struct HashmapAccumulator {
     hash_begins[hash] = my_index;
     keys[my_index]    = key;
     values[my_index]  = value;
-    return __insert_success;
   }
 
   // no values. simply adds to the keys.
diff --git a/src/common/KokkosKernels_IOUtils.hpp b/src/common/KokkosKernels_IOUtils.hpp
index bf1f3b4bfc..08e6f3cdc7 100644
--- a/src/common/KokkosKernels_IOUtils.hpp
+++ b/src/common/KokkosKernels_IOUtils.hpp
@@ -88,363 +88,6 @@ inline void getRandomBounds(double mag, Kokkos::complex<double> &start,
   end   = Kokkos::complex<double>(mag, mag);
 }
 
-// MD: Bases on Christian's sparseMatrix_generate function in test_crsmatrix.cpp
-// file.
-template <typename ScalarType, typename OrdinalType, typename SizeType>
-void kk_sparseMatrix_generate(OrdinalType nrows, OrdinalType ncols,
-                              SizeType &nnz, OrdinalType row_size_variance,
-                              OrdinalType bandwidth, ScalarType *&values,
-                              SizeType *&rowPtr, OrdinalType *&colInd) {
-  rowPtr = new SizeType[nrows + 1];
-
-  OrdinalType elements_per_row = nrows ? nnz / nrows : 0;
-  srand(13721);
-  rowPtr[0] = 0;
-  for (int row = 0; row < nrows; row++) {
-    int varianz       = (1.0 * rand() / RAND_MAX - 0.5) * row_size_variance;
-    int numRowEntries = elements_per_row + varianz;
-    if (numRowEntries < 0) numRowEntries = 0;
-    // Clamping numRowEntries above accomplishes 2 things:
-    //  - If ncols is 0, numRowEntries will also be 0
-    //  - With numRowEntries at most 2/3 the number of columns, in the worst
-    //  case
-    //    90% of insertions will succeed after 6 tries
-    if (numRowEntries > 0.66 * ncols) numRowEntries = 0.66 * ncols;
-    rowPtr[row + 1] = rowPtr[row] + numRowEntries;
-  }
-  nnz    = rowPtr[nrows];
-  values = new ScalarType[nnz];
-  colInd = new OrdinalType[nnz];
-  for (OrdinalType row = 0; row < nrows; row++) {
-    for (SizeType k = rowPtr[row]; k < rowPtr[row + 1]; ++k) {
-      while (true) {
-        OrdinalType pos = (1.0 * rand() / RAND_MAX - 0.5) * bandwidth + row;
-        while (pos < 0) pos += ncols;
-        while (pos >= ncols) pos -= ncols;
-
-        bool is_already_in_the_row = false;
-        for (SizeType j = rowPtr[row]; j < k; j++) {
-          if (colInd[j] == pos) {
-            is_already_in_the_row = true;
-            break;
-          }
-        }
-        if (!is_already_in_the_row) {
-          colInd[k] = pos;
-          break;
-        }
-      }
-    }
-  }
-  // Sample each value from uniform (-50, 50) for real types, or (-50 - 50i, 50
-  // + 50i) for complex types.
-  Kokkos::View<ScalarType *, Kokkos::HostSpace> valuesView(values, nnz);
-  ScalarType randStart, randEnd;
-  getRandomBounds(50.0, randStart, randEnd);
-  Kokkos::Random_XorShift64_Pool<Kokkos::DefaultHostExecutionSpace> pool(13718);
-  Kokkos::fill_random(valuesView, pool, randStart, randEnd);
-}
-
-template <typename ScalarType, typename OrdinalType, typename SizeType>
-void kk_sparseMatrix_generate_lower_upper_triangle(
-    char uplo, OrdinalType nrows, OrdinalType ncols, SizeType &nnz,
-    OrdinalType /*row_size_variance*/, OrdinalType /*bandwidth*/,
-    ScalarType *&values, SizeType *&rowPtr, OrdinalType *&colInd) {
-  rowPtr = new SizeType[nrows + 1];
-
-  // OrdinalType elements_per_row = nnz/nrows;
-  srand(13721);
-  rowPtr[0] = 0;
-  for (int row = 0; row < nrows; row++) {
-    if (uplo == 'L')
-      rowPtr[row + 1] = rowPtr[row] + row + 1;
-    else
-      rowPtr[row + 1] = rowPtr[row] + ncols - (row);
-  }
-  nnz    = rowPtr[nrows];
-  values = new ScalarType[nnz];
-  colInd = new OrdinalType[nnz];
-  for (OrdinalType row = 0; row < nrows; row++) {
-    for (SizeType k = rowPtr[row]; k < rowPtr[row + 1]; k++) {
-      if (uplo == 'L')
-        colInd[k] = k - rowPtr[row];
-      else
-        colInd[k] = row + (k - rowPtr[row]);
-      values[k] = 1.0;
-    }
-  }
-}
-
-template <typename ScalarType, typename OrdinalType, typename SizeType>
-void kk_diagonally_dominant_sparseMatrix_generate(
-    OrdinalType nrows, OrdinalType ncols, SizeType &nnz,
-    OrdinalType row_size_variance, OrdinalType bandwidth, ScalarType *&values,
-    SizeType *&rowPtr, OrdinalType *&colInd,
-    ScalarType diagDominance = 10 * Kokkos::ArithTraits<ScalarType>::one()) {
-  rowPtr = new SizeType[nrows + 1];
-
-  OrdinalType elements_per_row = nnz / nrows;
-  srand(13721);
-  rowPtr[0] = 0;
-  for (int row = 0; row < nrows; row++) {
-    int varianz = (1.0 * rand() / RAND_MAX - 0.5) * row_size_variance;
-    if (varianz < 1) varianz = 1;
-    if (varianz > 0.75 * ncols) varianz = 0.75 * ncols;
-    rowPtr[row + 1] = rowPtr[row] + elements_per_row + varianz;
-    if (rowPtr[row + 1] <= rowPtr[row])   // This makes sure that there is
-      rowPtr[row + 1] = rowPtr[row] + 1;  // at least one nonzero in the row
-  }
-  nnz    = rowPtr[nrows];
-  values = new ScalarType[nnz];
-  colInd = new OrdinalType[nnz];
-  for (OrdinalType row = 0; row < nrows; row++) {
-    ScalarType total_values = 0;
-    std::unordered_set<OrdinalType> entriesInRow;
-    // We always add the diagonal entry (after this loop)
-    entriesInRow.insert(row);
-    for (SizeType k = rowPtr[row]; k < rowPtr[row + 1] - 1; k++) {
-      while (true) {
-        OrdinalType pos = (1.0 * rand() / RAND_MAX - 0.5) * bandwidth + row;
-        while (pos < 0) pos += ncols;
-        while (pos >= ncols) pos -= ncols;
-
-        if (entriesInRow.find(pos) == entriesInRow.end()) {
-          entriesInRow.insert(pos);
-          colInd[k] = pos;
-          values[k] = 100.0 * rand() / RAND_MAX - 50.0;
-          total_values +=
-              Kokkos::Details::ArithTraits<ScalarType>::abs(values[k]);
-          break;
-        }
-      }
-    }
-
-    colInd[rowPtr[row + 1] - 1] = row;
-    values[rowPtr[row + 1] - 1] = total_values * diagDominance;
-  }
-}
-
-// This function creates a diagonal sparse matrix for testing matrix operations.
-// The elements on the diagonal are 1, 2, ..., n-1, n.
-// If "invert" is true, it will return the inverse of the above diagonal matrix.
-template <typename crsMat_t>
-crsMat_t kk_generate_diag_matrix(typename crsMat_t::const_ordinal_type n,
-                                 const bool invert = false) {
-  typedef typename crsMat_t::ordinal_type ot;
-  typedef typename crsMat_t::StaticCrsGraphType graph_t;
-  typedef typename graph_t::row_map_type::non_const_type row_map_view_t;
-  typedef typename graph_t::entries_type::non_const_type cols_view_t;
-  typedef typename crsMat_t::values_type::non_const_type values_view_t;
-
-  typedef typename row_map_view_t::non_const_value_type size_type;
-  typedef typename cols_view_t::non_const_value_type lno_t;
-  typedef typename values_view_t::non_const_value_type scalar_t;
-
-  row_map_view_t rowmap_view("rowmap_view", n + 1);
-  cols_view_t columns_view("colsmap_view", n);
-  values_view_t values_view("values_view", n);
-
-  {
-    typename row_map_view_t::HostMirror hr =
-        Kokkos::create_mirror_view(rowmap_view);
-    typename cols_view_t::HostMirror hc =
-        Kokkos::create_mirror_view(columns_view);
-    typename values_view_t::HostMirror hv =
-        Kokkos::create_mirror_view(values_view);
-
-    for (lno_t i = 0; i <= n; ++i) {
-      hr(i) = size_type(i);
-    }
-
-    for (ot i = 0; i < n; ++i) {
-      hc(i) = lno_t(i);
-      if (invert) {
-        hv(i) = scalar_t(1.0) / (scalar_t(i + 1));
-      } else {
-        hv(i) = scalar_t(i + 1);
-      }
-    }
-    Kokkos::deep_copy(rowmap_view, hr);
-    Kokkos::deep_copy(columns_view, hc);
-    Kokkos::deep_copy(values_view, hv);
-  }
-
-  graph_t static_graph(columns_view, rowmap_view);
-  crsMat_t crsmat("CrsMatrix", n, values_view, static_graph);
-  return crsmat;
-}
-
-template <typename crsMat_t>
-crsMat_t kk_generate_diagonally_dominant_sparse_matrix(
-    typename crsMat_t::const_ordinal_type nrows,
-    typename crsMat_t::const_ordinal_type ncols,
-    typename crsMat_t::non_const_size_type &nnz,
-    typename crsMat_t::const_ordinal_type row_size_variance,
-    typename crsMat_t::const_ordinal_type bandwidth,
-    typename crsMat_t::const_value_type diagDominance =
-        10 * Kokkos::ArithTraits<typename crsMat_t::value_type>::one()) {
-  typedef typename crsMat_t::StaticCrsGraphType graph_t;
-  typedef typename graph_t::row_map_type::non_const_type row_map_view_t;
-  typedef typename graph_t::entries_type::non_const_type cols_view_t;
-  typedef typename crsMat_t::values_type::non_const_type values_view_t;
-
-  typedef typename row_map_view_t::non_const_value_type size_type;
-  typedef typename cols_view_t::non_const_value_type lno_t;
-  typedef typename values_view_t::non_const_value_type scalar_t;
-  lno_t *adj;
-  size_type *xadj;  //, nnzA;
-  scalar_t *values;
-
-  kk_diagonally_dominant_sparseMatrix_generate<scalar_t, lno_t, size_type>(
-      nrows, ncols, nnz, row_size_variance, bandwidth, values, xadj, adj,
-      diagDominance);
-
-  row_map_view_t rowmap_view("rowmap_view", nrows + 1);
-  cols_view_t columns_view("colsmap_view", nnz);
-  values_view_t values_view("values_view", nnz);
-
-  {
-    typename row_map_view_t::HostMirror hr =
-        Kokkos::create_mirror_view(rowmap_view);
-    typename cols_view_t::HostMirror hc =
-        Kokkos::create_mirror_view(columns_view);
-    typename values_view_t::HostMirror hv =
-        Kokkos::create_mirror_view(values_view);
-
-    for (lno_t i = 0; i <= nrows; ++i) {
-      hr(i) = xadj[i];
-    }
-
-    for (size_type i = 0; i < nnz; ++i) {
-      hc(i) = adj[i];
-      hv(i) = values[i];
-    }
-    Kokkos::deep_copy(rowmap_view, hr);
-    Kokkos::deep_copy(columns_view, hc);
-    Kokkos::deep_copy(values_view, hv);
-  }
-
-  graph_t static_graph(columns_view, rowmap_view);
-  crsMat_t crsmat("CrsMatrix", ncols, values_view, static_graph);
-  delete[] xadj;
-  delete[] adj;
-  delete[] values;
-  return crsmat;
-}
-
-template <typename crsMat_t>
-crsMat_t kk_generate_triangular_sparse_matrix(
-    char uplo, typename crsMat_t::const_ordinal_type nrows,
-    typename crsMat_t::const_ordinal_type ncols,
-    typename crsMat_t::non_const_size_type &nnz,
-    typename crsMat_t::const_ordinal_type row_size_variance,
-    typename crsMat_t::const_ordinal_type bandwidth) {
-  typedef typename crsMat_t::StaticCrsGraphType graph_t;
-  typedef typename graph_t::row_map_type::non_const_type row_map_view_t;
-  typedef typename graph_t::entries_type::non_const_type cols_view_t;
-  typedef typename crsMat_t::values_type::non_const_type values_view_t;
-
-  typedef typename row_map_view_t::non_const_value_type size_type;
-  typedef typename cols_view_t::non_const_value_type lno_t;
-  typedef typename values_view_t::non_const_value_type scalar_t;
-  lno_t *adj;
-  size_type *xadj;  //, nnzA;
-  scalar_t *values;
-
-  kk_sparseMatrix_generate_lower_upper_triangle<scalar_t, lno_t, size_type>(
-      uplo, nrows, ncols, nnz, row_size_variance, bandwidth, values, xadj, adj);
-
-  row_map_view_t rowmap_view("rowmap_view", nrows + 1);
-  cols_view_t columns_view("colsmap_view", nnz);
-  values_view_t values_view("values_view", nnz);
-
-  {
-    typename row_map_view_t::HostMirror hr =
-        Kokkos::create_mirror_view(rowmap_view);
-    typename cols_view_t::HostMirror hc =
-        Kokkos::create_mirror_view(columns_view);
-    typename values_view_t::HostMirror hv =
-        Kokkos::create_mirror_view(values_view);
-
-    for (lno_t i = 0; i <= nrows; ++i) {
-      hr(i) = xadj[i];
-    }
-
-    for (size_type i = 0; i < nnz; ++i) {
-      hc(i) = adj[i];
-      hv(i) = values[i];
-    }
-    Kokkos::deep_copy(rowmap_view, hr);
-    Kokkos::deep_copy(columns_view, hc);
-    Kokkos::deep_copy(values_view, hv);
-    Kokkos::fence();
-  }
-
-  graph_t static_graph(columns_view, rowmap_view);
-  crsMat_t crsmat("CrsMatrix", ncols, values_view, static_graph);
-  delete[] xadj;
-  delete[] adj;
-  delete[] values;
-  return crsmat;
-}
-
-template <typename crsMat_t>
-crsMat_t kk_generate_sparse_matrix(
-    typename crsMat_t::const_ordinal_type nrows,
-    typename crsMat_t::const_ordinal_type ncols,
-    typename crsMat_t::non_const_size_type &nnz,
-    typename crsMat_t::const_ordinal_type row_size_variance,
-    typename crsMat_t::const_ordinal_type bandwidth) {
-  typedef typename crsMat_t::StaticCrsGraphType graph_t;
-  typedef typename graph_t::row_map_type::non_const_type row_map_view_t;
-  typedef typename graph_t::entries_type::non_const_type cols_view_t;
-  typedef typename crsMat_t::values_type::non_const_type values_view_t;
-
-  typedef typename row_map_view_t::non_const_value_type size_type;
-  typedef typename cols_view_t::non_const_value_type lno_t;
-  typedef typename values_view_t::non_const_value_type scalar_t;
-  lno_t *adj;
-  size_type *xadj;  //, nnzA;
-  scalar_t *values;
-
-  kk_sparseMatrix_generate<scalar_t, lno_t, size_type>(
-      nrows, ncols, nnz, row_size_variance, bandwidth, values, xadj, adj);
-
-  row_map_view_t rowmap_view("rowmap_view", nrows + 1);
-  cols_view_t columns_view("colsmap_view", nnz);
-  values_view_t values_view("values_view", nnz);
-
-  {
-    typename row_map_view_t::HostMirror hr =
-        Kokkos::create_mirror_view(rowmap_view);
-    typename cols_view_t::HostMirror hc =
-        Kokkos::create_mirror_view(columns_view);
-    typename values_view_t::HostMirror hv =
-        Kokkos::create_mirror_view(values_view);
-
-    for (lno_t i = 0; i <= nrows; ++i) {
-      hr(i) = xadj[i];
-    }
-
-    for (size_type i = 0; i < nnz; ++i) {
-      hc(i) = adj[i];
-      hv(i) = values[i];
-    }
-    Kokkos::deep_copy(rowmap_view, hr);
-    Kokkos::deep_copy(columns_view, hc);
-    Kokkos::deep_copy(values_view, hv);
-  }
-
-  graph_t static_graph(columns_view, rowmap_view);
-  crsMat_t crsmat("CrsMatrix", ncols, values_view, static_graph);
-  delete[] xadj;
-  delete[] adj;
-  delete[] values;
-  return crsmat;
-}
-
-// TODO: need to fix the size_type. All over the reading inputs are lno_t.
-
 template <typename stype>
 void md_malloc(stype **arr, size_t n, std::string /*alloc_str*/ = "") {
   *arr = new stype[n];
@@ -550,133 +193,85 @@ inline void kk_read_1Dview_from_file(idx_array_type &view,
   Kokkos::fence();
 }
 
-template <typename idx>
-void convert_crs_to_lower_triangle_edge_list(idx nv, idx *xadj, idx *adj,
-                                             idx *lower_triangle_srcs,
-                                             idx *lower_triangle_dests) {
-  idx ind = 0;
-  for (idx i = 0; i < nv; ++i) {
-    idx xb = xadj[i];
-    idx xe = xadj[i + 1];
-    for (idx j = xb; j < xe; ++j) {
-      idx dst = adj[j];
-      if (i < dst) {
-        lower_triangle_srcs[ind]    = i;
-        lower_triangle_dests[ind++] = dst;
-      }
-    }
-  }
-}
-
-template <typename idx>
-void convert_crs_to_edge_list(idx nv, idx *xadj, idx *srcs) {
-  for (idx i = 0; i < nv; ++i) {
-    idx xb = xadj[i];
-    idx xe = xadj[i + 1];
-    for (idx j = xb; j < xe; ++j) {
-      srcs[j] = i;
+template <typename idx_array_type>
+inline void kk_write_2Dview_to_file(idx_array_type view, const char *filename) {
+  typedef typename idx_array_type::HostMirror host_type;
+  // typedef typename idx_array_type::size_type idx;
+  host_type host_view = Kokkos::create_mirror_view(view);
+  Kokkos::deep_copy(host_view, view);
+  Kokkos::fence();
+  std::ofstream myFile(filename, std::ios::out);
+  for (size_t i = 0; i < view.extent(0); ++i) {
+    for (size_t j = 0; j < view.extent(1); ++j) {
+      myFile << host_view(i, j) << " ";
     }
+    myFile << std::endl;
   }
+  myFile.close();
 }
 
-template <typename size_type, typename lno_t, typename wt>
-void convert_edge_list_to_csr(lno_t nv, size_type ne, lno_t *srcs, lno_t *dests,
-                              wt *ew, size_type *xadj, lno_t *adj, wt *crs_ew) {
-  std::vector<struct Edge<lno_t, wt>> edges(ne);
-  for (size_type i = 0; i < ne; ++i) {
-    edges[i].src = srcs[i];
-    edges[i].dst = dests[i];
-    edges[i].ew  = ew[i];
-  }
-  std::sort(edges.begin(), edges.begin() + ne);
+template <typename idx_array_type>
+inline void kk_read_2Dview_from_file(idx_array_type &view,
+                                     const char *filename) {
+  typedef typename idx_array_type::HostMirror host_type;
+  // typedef typename idx_array_type::size_type idx;
+  host_type host_view = Kokkos::create_mirror_view(view);
+  std::ifstream myFile(filename, std::ios::in);
 
-  size_type eind = 0;
-  for (lno_t i = 0; i < nv; ++i) {
-    (xadj)[i] = eind;
-    while (edges[eind].src == i) {
-      (adj)[eind]     = edges[eind].dst;
-      (*crs_ew)[eind] = edges[eind].ew;
-      ++eind;
+  for (size_t i = 0; i < view.extent(0); ++i) {
+    for (size_t j = 0; j < view.extent(1); ++j) {
+      myFile >> host_view(i, j);
     }
   }
-  xadj[nv] = eind;
+  myFile.close();
+  Kokkos::deep_copy(view, host_view);
+  Kokkos::fence();
 }
 
-template <typename in_lno_t, typename size_type, typename lno_t>
-void convert_undirected_edge_list_to_csr(lno_t nv, size_type ne, in_lno_t *srcs,
-                                         in_lno_t *dests, size_type *xadj,
-                                         lno_t *adj) {
-  std::vector<struct Edge<lno_t, double>> edges(ne * 2);
-  for (size_type i = 0; i < ne; ++i) {
-    edges[i * 2].src = srcs[i];
-    edges[i * 2].dst = dests[i];
-
-    edges[i * 2 + 1].src = dests[i];
-    edges[i * 2 + 1].dst = srcs[i];
-  }
-#ifdef KOKKOSKERNELS_HAVE_OUTER
-#include <parallel/multiseq_selection.h>
-#include <parallel/multiway_merge.h>
-#include <parallel/merge.h>
-#include <parallel/multiway_mergesort.h>
-  __gnu_parallel::parallel_sort_mwms<false, true, struct Edge<lno_t, double> *>(
-      &(edges[0]), &(edges[0]) + ne * 2,
-      std::less<struct Edge<lno_t, double>>(), 64);
-#else
-  std::sort(edges.begin(), edges.begin() + ne * 2);
-#endif
-
-  size_type eind = 0;
-  for (lno_t i = 0; i < nv; ++i) {
-    (xadj)[i] = eind;
-    while (edges[eind].src == i) {
-      (adj)[eind] = edges[eind].dst;
-      //(*crs_ew)[eind] = edges[eind].ew;
-      ++eind;
+template <typename idx_array_type>
+inline void kk_write_3Dview_to_file(idx_array_type view, const char *filename) {
+  typedef typename idx_array_type::HostMirror host_type;
+  // typedef typename idx_array_type::size_type idx;
+  host_type host_view = Kokkos::create_mirror_view(view);
+  Kokkos::deep_copy(host_view, view);
+  Kokkos::fence();
+  std::ofstream myFile(filename, std::ios::out);
+  for (size_t i = 0; i < view.extent(0); ++i) {
+    for (size_t j = 0; j < view.extent(1); ++j) {
+      for (size_t k = 0; k < view.extent(2); ++k) {
+        myFile << host_view(i, j, k) << " ";
+      }
+      myFile << std::endl;
     }
+    myFile << std::endl;
   }
-  xadj[nv] = eind;
+  myFile.close();
 }
-/*
-
-template <typename lno_t, typename size_type, typename scalar_t>
-void read_graph_src_dst_bin(
-    lno_t *nv, size_type *ne
-    ,size_type **xadj, lno_t **adj, scalar_t **ew,
-    const char *fnameSrc, const char *fnameTarg){
 
-  size_t numEdges = 0;
-  size_t *srcs, *dst; //this type is hard coded
-  buildEdgeListFromBinSrcTarg_undirected(
-      fnameSrc, fnameTarg,
-      &numEdges,
-      &srcs, &dst);
+template <typename idx_array_type>
+inline void kk_read_3Dview_from_file(idx_array_type &view,
+                                     const char *filename) {
+  typedef typename idx_array_type::HostMirror host_type;
+  // typedef typename idx_array_type::size_type idx;
+  host_type host_view = Kokkos::create_mirror_view(view);
+  std::ifstream myFile(filename, std::ios::in);
 
-  lno_t num_vertex = 0;
-  for (size_t i = 0; i < numEdges; ++i){
-    if (num_vertex < srcs[i]) num_vertex = srcs[i];
-    if (num_vertex < dst[i]) num_vertex = dst[i];
+  for (size_t i = 0; i < view.extent(0); ++i) {
+    for (size_t j = 0; j < view.extent(1); ++j) {
+      for (size_t k = 0; k < view.extent(2); ++k) {
+        myFile >> host_view(i, j, k);
+      }
+    }
   }
-  num_vertex += 1;
-
-  *nv = num_vertex;
-  *ne = numEdges * 2;
-
-  md_malloc<size_type>(xadj, num_vertex + 1);
-  md_malloc<lno_t>(adj, numEdges * 2);
-  convert_undirected_edge_list_to_csr (
-      num_vertex, numEdges,
-      srcs, dst,
-      *xadj, *adj);
-
-  delete [] srcs;
-  delete [] dst;
+  myFile.close();
+  Kokkos::deep_copy(view, host_view);
+  Kokkos::fence();
 }
-*/
 
 template <typename idx, typename wt>
-void write_edgelist_bin(size_t ne, const idx *edge_begins, const idx *edge_ends,
-                        const wt *ew, const char *filename) {
+[[deprecated]] void write_edgelist_bin(size_t ne, const idx *edge_begins,
+                                       const idx *edge_ends, const wt *ew,
+                                       const char *filename) {
   std::ofstream myFile(filename, std::ios::out | std::ios::binary);
   myFile.write((char *)&ne, sizeof(idx));
   myFile.write((char *)edge_begins, sizeof(idx) * (ne));
@@ -700,270 +295,6 @@ void read_edgelist_bin(idx *ne, idx **edge_begins, idx **edge_ends, wt **ew,
   myFile.close();
 }
 
-template <typename lno_t, typename size_type, typename scalar_t>
-void write_graph_bin(lno_t nv, size_type ne, const size_type *xadj,
-                     const lno_t *adj, const scalar_t *ew,
-                     const char *filename) {
-  std::ofstream myFile(filename, std::ios::out | std::ios::binary);
-  myFile.write((char *)&nv, sizeof(lno_t));
-  myFile.write((char *)&ne, sizeof(size_type));
-  myFile.write((char *)xadj, sizeof(size_type) * (nv + 1));
-
-  myFile.write((char *)adj, sizeof(lno_t) * (ne));
-
-  myFile.write((char *)ew, sizeof(scalar_t) * (ne));
-
-  myFile.close();
-}
-
-template <typename lno_t, typename size_type, typename scalar_t>
-void write_graph_crs(lno_t nv, size_type ne, const size_type *xadj,
-                     const lno_t *adj, const scalar_t *ew,
-                     const char *filename) {
-  std::ofstream myFile(filename, std::ios::out);
-  myFile << nv << " " << ne << std::endl;
-
-  for (lno_t i = 0; i <= nv; ++i) {
-    myFile << xadj[i] << " ";
-  }
-  myFile << std::endl;
-
-  for (lno_t i = 0; i < nv; ++i) {
-    size_type b = xadj[i];
-    size_type e = xadj[i + 1];
-    for (size_type j = b; j < e; ++j) {
-      myFile << adj[j] << " ";
-    }
-    myFile << std::endl;
-  }
-  for (size_type i = 0; i < ne; ++i) {
-    myFile << ew[i] << " ";
-  }
-  myFile << std::endl;
-
-  myFile.close();
-}
-
-template <typename lno_t, typename size_type, typename scalar_t>
-void write_graph_ligra(lno_t nv, size_type ne, const size_type *xadj,
-                       const lno_t *adj, const scalar_t * /*ew*/,
-                       const char *filename) {
-  std::ofstream ff(filename);
-  ff << "AdjacencyGraph" << std::endl;
-  ff << nv << std::endl << ne << std::endl;
-  for (lno_t i = 0; i < nv; ++i) {
-    ff << xadj[i] << std::endl;
-  }
-  for (size_type i = 0; i < ne; ++i) {
-    ff << adj[i] << std::endl;
-  }
-  ff.close();
-}
-
-// MM: types and utility functions for parsing the MatrixMarket format
-namespace MM {
-enum MtxObject { UNDEFINED_OBJECT, MATRIX, VECTOR };
-enum MtxFormat { UNDEFINED_FORMAT, COORDINATE, ARRAY };
-enum MtxField {
-  UNDEFINED_FIELD,
-  REAL,     // includes both float and double
-  COMPLEX,  // includes complex<float> and complex<double>
-  INTEGER,  // includes all integer types
-  PATTERN   // not a type, but means the value for every entry is 1
-};
-enum MtxSym {
-  UNDEFINED_SYMMETRY,
-  GENERAL,
-  SYMMETRIC,       // A(i, j) = A(j, i)
-  SKEW_SYMMETRIC,  // A(i, j) = -A(j, i)
-  HERMITIAN        // A(i, j) = a + bi; A(j, i) = a - bi
-};
-
-// readScalar/writeScalar: read and write a scalar in the form that it appears
-// in an .mtx file. The >> and << operators won't work, because complex appears
-// as "real imag", not "(real, imag)"
-template <typename scalar_t>
-scalar_t readScalar(std::istream &is) {
-  scalar_t val;
-  is >> val;
-  return val;
-}
-
-template <>
-inline Kokkos::complex<float> readScalar(std::istream &is) {
-  float r, i;
-  is >> r;
-  is >> i;
-  return Kokkos::complex<float>(r, i);
-}
-
-template <>
-inline Kokkos::complex<double> readScalar(std::istream &is) {
-  double r, i;
-  is >> r;
-  is >> i;
-  return Kokkos::complex<double>(r, i);
-}
-
-template <typename scalar_t>
-void writeScalar(std::ostream &os, scalar_t val) {
-  os << val;
-}
-
-template <>
-inline void writeScalar(std::ostream &os, Kokkos::complex<float> val) {
-  os << val.real() << ' ' << val.imag();
-}
-
-template <>
-inline void writeScalar(std::ostream &os, Kokkos::complex<double> val) {
-  os << val.real() << ' ' << val.imag();
-}
-
-// symmetryFlip: given a value for A(i, j), return the value that
-// should be inserted at A(j, i) (if any)
-template <typename scalar_t>
-scalar_t symmetryFlip(scalar_t val, MtxSym symFlag) {
-  if (symFlag == SKEW_SYMMETRIC) return -val;
-  return val;
-}
-
-template <>
-inline Kokkos::complex<float> symmetryFlip(Kokkos::complex<float> val,
-                                           MtxSym symFlag) {
-  if (symFlag == HERMITIAN)
-    return Kokkos::conj(val);
-  else if (symFlag == SKEW_SYMMETRIC)
-    return -val;
-  return val;
-}
-
-template <>
-inline Kokkos::complex<double> symmetryFlip(Kokkos::complex<double> val,
-                                            MtxSym symFlag) {
-  if (symFlag == HERMITIAN)
-    return Kokkos::conj(val);
-  else if (symFlag == SKEW_SYMMETRIC)
-    return -val;
-  return val;
-}
-}  // namespace MM
-
-template <typename lno_t, typename size_type, typename scalar_t>
-void write_matrix_mtx(lno_t nrows, lno_t ncols, size_type nentries,
-                      const size_type *xadj, const lno_t *adj,
-                      const scalar_t *vals, const char *filename) {
-  std::ofstream myFile(filename);
-  myFile << "%%MatrixMarket matrix coordinate ";
-  if (std::is_same<scalar_t, Kokkos::complex<float>>::value ||
-      std::is_same<scalar_t, Kokkos::complex<double>>::value)
-    myFile << "complex";
-  else
-    myFile << "real";
-  myFile << " general\n";
-  myFile << nrows << " " << ncols << " " << nentries << '\n';
-  myFile << std::setprecision(17) << std::scientific;
-  for (lno_t i = 0; i < nrows; ++i) {
-    size_type b = xadj[i];
-    size_type e = xadj[i + 1];
-    for (size_type j = b; j < e; ++j) {
-      myFile << i + 1 << " " << adj[j] + 1 << " ";
-      MM::writeScalar<scalar_t>(myFile, vals[j]);
-      myFile << '\n';
-    }
-  }
-  myFile.close();
-}
-
-template <typename lno_t, typename size_type, typename scalar_t>
-void write_graph_mtx(lno_t nv, size_type ne, const size_type *xadj,
-                     const lno_t *adj, const scalar_t *ew,
-                     const char *filename) {
-  std::ofstream myFile(filename);
-  myFile << "%%MatrixMarket matrix coordinate ";
-  if (std::is_same<scalar_t, Kokkos::complex<float>>::value ||
-      std::is_same<scalar_t, Kokkos::complex<double>>::value)
-    myFile << "complex";
-  else
-    myFile << "real";
-  myFile << " general\n";
-  myFile << nv << " " << nv << " " << ne << '\n';
-  myFile << std::setprecision(8) << std::scientific;
-  for (lno_t i = 0; i < nv; ++i) {
-    size_type b = xadj[i];
-    size_type e = xadj[i + 1];
-    for (size_type j = b; j < e; ++j) {
-      myFile << i + 1 << " " << (adj)[j] + 1 << " ";
-      MM::writeScalar<scalar_t>(myFile, ew[j]);
-      myFile << '\n';
-    }
-  }
-
-  myFile.close();
-}
-
-template <typename lno_t, typename size_type, typename scalar_t>
-void read_graph_bin(lno_t *nv, size_type *ne, size_type **xadj, lno_t **adj,
-                    scalar_t **ew, const char *filename) {
-  std::ifstream myFile(filename, std::ios::in | std::ios::binary);
-
-  myFile.read((char *)nv, sizeof(lno_t));
-  myFile.read((char *)ne, sizeof(size_type));
-  md_malloc<size_type>(xadj, *nv + 1);
-  md_malloc<lno_t>(adj, *ne);
-  md_malloc<scalar_t>(ew, *ne);
-  myFile.read((char *)*xadj, sizeof(size_type) * (*nv + 1));
-  myFile.read((char *)*adj, sizeof(lno_t) * (*ne));
-  myFile.read((char *)*ew, sizeof(scalar_t) * (*ne));
-  myFile.close();
-}
-
-// When Kokkos issue #2313 is resolved, can delete
-// parseScalar and just use operator>>
-template <typename scalar_t>
-scalar_t parseScalar(std::istream &is) {
-  scalar_t val;
-  is >> val;
-  return val;
-}
-
-template <>
-inline Kokkos::complex<float> parseScalar(std::istream &is) {
-  std::complex<float> val;
-  is >> val;
-  return Kokkos::complex<float>(val);
-}
-
-template <>
-inline Kokkos::complex<double> parseScalar(std::istream &is) {
-  std::complex<double> val;
-  is >> val;
-  return Kokkos::complex<double>(val);
-}
-
-template <typename lno_t, typename size_type, typename scalar_t>
-void read_graph_crs(lno_t *nv, size_type *ne, size_type **xadj, lno_t **adj,
-                    scalar_t **ew, const char *filename) {
-  std::ifstream myFile(filename, std::ios::in);
-  myFile >> *nv >> *ne;
-
-  md_malloc<size_type>(xadj, *nv + 1);
-  md_malloc<lno_t>(adj, *ne);
-  md_malloc<scalar_t>(ew, *ne);
-
-  for (lno_t i = 0; i <= *nv; ++i) {
-    myFile >> (*xadj)[i];
-  }
-
-  for (size_type i = 0; i < *ne; ++i) {
-    myFile >> (*adj)[i];
-  }
-  for (size_type i = 0; i < *ne; ++i) {
-    (*ew)[i] = parseScalar<scalar_t>(myFile);
-  }
-  myFile.close();
-}
-
 inline bool endswith(std::string const &fullString, std::string const &ending) {
   if (fullString.length() >= ending.length()) {
     return (0 == fullString.compare(fullString.length() - ending.length(),
@@ -973,491 +304,6 @@ inline bool endswith(std::string const &fullString, std::string const &ending) {
   }
 }
 
-template <typename crs_matrix_t>
-void write_kokkos_crst_matrix(crs_matrix_t a_crsmat, const char *filename) {
-  typedef typename crs_matrix_t::StaticCrsGraphType graph_t;
-  typedef typename graph_t::row_map_type::non_const_type row_map_view_t;
-  typedef typename graph_t::entries_type::non_const_type cols_view_t;
-  typedef typename crs_matrix_t::values_type::non_const_type values_view_t;
-
-  typedef typename row_map_view_t::value_type offset_t;
-  typedef typename cols_view_t::value_type lno_t;
-  typedef typename values_view_t::value_type scalar_t;
-  typedef typename values_view_t::size_type size_type;
-
-  size_type nnz = a_crsmat.nnz();
-
-  auto a_rowmap_view = Kokkos::create_mirror_view_and_copy(
-      Kokkos::HostSpace(), a_crsmat.graph.row_map);
-  auto a_entries_view = Kokkos::create_mirror_view_and_copy(
-      Kokkos::HostSpace(), a_crsmat.graph.entries);
-  auto a_values_view =
-      Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), a_crsmat.values);
-  offset_t *a_rowmap = const_cast<offset_t *>(a_rowmap_view.data());
-  lno_t *a_entries   = a_entries_view.data();
-  scalar_t *a_values = a_values_view.data();
-
-  std::string strfilename(filename);
-  if (endswith(strfilename, ".mtx") || endswith(strfilename, ".mm")) {
-    write_matrix_mtx<lno_t, offset_t, scalar_t>(
-        a_crsmat.numRows(), a_crsmat.numCols(), a_crsmat.nnz(), a_rowmap,
-        a_entries, a_values, filename);
-    return;
-  } else if (a_crsmat.numRows() != a_crsmat.numCols()) {
-    throw std::runtime_error(
-        "For formats other than MatrixMarket (suffix .mm or .mtx),\n"
-        "write_kokkos_crst_matrix only supports square matrices");
-  }
-  if (endswith(strfilename, ".bin")) {
-    write_graph_bin<lno_t, offset_t, scalar_t>(
-        a_crsmat.numRows(), nnz, a_rowmap, a_entries, a_values, filename);
-  } else if (endswith(strfilename, ".ligra")) {
-    write_graph_ligra<lno_t, offset_t, scalar_t>(
-        a_crsmat.numRows(), nnz, a_rowmap, a_entries, a_values, filename);
-  } else if (endswith(strfilename, ".crs")) {
-    write_graph_crs<lno_t, offset_t, scalar_t>(
-        a_crsmat.numRows(), nnz, a_rowmap, a_entries, a_values, filename);
-  } else {
-    std::string errMsg =
-        std::string("write_kokkos_crst_matrix: File extension on ") + filename +
-        " does not correspond to a known format";
-    throw std::runtime_error(errMsg);
-  }
-}
-
-template <typename lno_t, typename size_type, typename scalar_t>
-int read_mtx(const char *fileName, lno_t *nrows, lno_t *ncols, size_type *ne,
-             size_type **xadj, lno_t **adj, scalar_t **ew,
-             bool symmetrize = false, bool remove_diagonal = true,
-             bool transpose = false) {
-  using namespace MM;
-  std::ifstream mmf(fileName, std::ifstream::in);
-  if (!mmf.is_open()) {
-    throw std::runtime_error("File cannot be opened\n");
-  }
-
-  std::string fline = "";
-  getline(mmf, fline);
-
-  if (fline.size() < 2 || fline[0] != '%' || fline[1] != '%') {
-    throw std::runtime_error("Invalid MM file. Line-1\n");
-  }
-
-  // make sure every required field is in the file, by initializing them to
-  // UNDEFINED_*
-  MtxObject mtx_object = UNDEFINED_OBJECT;
-  MtxFormat mtx_format = UNDEFINED_FORMAT;
-  MtxField mtx_field   = UNDEFINED_FIELD;
-  MtxSym mtx_sym       = UNDEFINED_SYMMETRY;
-
-  if (fline.find("matrix") != std::string::npos) {
-    mtx_object = MATRIX;
-  } else if (fline.find("vector") != std::string::npos) {
-    mtx_object = VECTOR;
-    throw std::runtime_error(
-        "MatrixMarket \"vector\" is not supported by KokkosKernels read_mtx()");
-  }
-
-  if (fline.find("coordinate") != std::string::npos) {
-    // sparse
-    mtx_format = COORDINATE;
-  } else if (fline.find("array") != std::string::npos) {
-    // dense
-    mtx_format = ARRAY;
-  }
-
-  if (fline.find("real") != std::string::npos ||
-      fline.find("double") != std::string::npos) {
-    if (std::is_same<scalar_t, Kokkos::Experimental::half_t>::value ||
-        std::is_same<scalar_t, Kokkos::Experimental::bhalf_t>::value)
-      mtx_field = REAL;
-    else {
-      if (!std::is_floating_point<scalar_t>::value)
-        throw std::runtime_error(
-            "scalar_t in read_mtx() incompatible with float or double typed "
-            "MatrixMarket file.");
-      else
-        mtx_field = REAL;
-    }
-  } else if (fline.find("complex") != std::string::npos) {
-    if (!(std::is_same<scalar_t, Kokkos::complex<float>>::value ||
-          std::is_same<scalar_t, Kokkos::complex<double>>::value))
-      throw std::runtime_error(
-          "scalar_t in read_mtx() incompatible with complex-typed MatrixMarket "
-          "file.");
-    else
-      mtx_field = COMPLEX;
-  } else if (fline.find("integer") != std::string::npos) {
-    if (std::is_integral<scalar_t>::value ||
-        std::is_floating_point<scalar_t>::value ||
-        std::is_same<scalar_t, Kokkos::Experimental::half_t>::value ||
-        std::is_same<scalar_t, Kokkos::Experimental::bhalf_t>::value)
-      mtx_field = INTEGER;
-    else
-      throw std::runtime_error(
-          "scalar_t in read_mtx() incompatible with integer-typed MatrixMarket "
-          "file.");
-  } else if (fline.find("pattern") != std::string::npos) {
-    mtx_field = PATTERN;
-    // any reasonable choice for scalar_t can represent "1" or "1.0 + 0i", so
-    // nothing to check here
-  }
-
-  if (fline.find("general") != std::string::npos) {
-    mtx_sym = GENERAL;
-  } else if (fline.find("skew-symmetric") != std::string::npos) {
-    mtx_sym = SKEW_SYMMETRIC;
-  } else if (fline.find("symmetric") != std::string::npos) {
-    // checking for "symmetric" after "skew-symmetric" because it's a substring
-    mtx_sym = SYMMETRIC;
-  } else if (fline.find("hermitian") != std::string::npos ||
-             fline.find("Hermitian") != std::string::npos) {
-    mtx_sym = HERMITIAN;
-  }
-  // Validate the matrix attributes
-  if (mtx_format == ARRAY) {
-    if (mtx_sym == UNDEFINED_SYMMETRY) mtx_sym = GENERAL;
-    if (mtx_sym != GENERAL)
-      throw std::runtime_error(
-          "array format MatrixMarket file must have general symmetry (optional "
-          "to include \"general\")");
-  }
-  if (mtx_object == UNDEFINED_OBJECT)
-    throw std::runtime_error(
-        "MatrixMarket file header is missing the object type.");
-  if (mtx_format == UNDEFINED_FORMAT)
-    throw std::runtime_error("MatrixMarket file header is missing the format.");
-  if (mtx_field == UNDEFINED_FIELD)
-    throw std::runtime_error(
-        "MatrixMarket file header is missing the field type.");
-  if (mtx_sym == UNDEFINED_SYMMETRY)
-    throw std::runtime_error(
-        "MatrixMarket file header is missing the symmetry type.");
-
-  while (1) {
-    getline(mmf, fline);
-    if (fline[0] != '%') break;
-  }
-  std::stringstream ss(fline);
-  lno_t nr = 0, nc = 0;
-  size_type nnz = 0;
-  ss >> nr >> nc;
-  if (mtx_format == COORDINATE)
-    ss >> nnz;
-  else
-    nnz = nr * nc;
-  size_type numEdges = nnz;
-  symmetrize         = symmetrize || mtx_sym != GENERAL;
-  if (symmetrize && nr != nc) {
-    throw std::runtime_error("A non-square matrix cannot be symmetrized.");
-  }
-  if (mtx_format == ARRAY) {
-    // Array format only supports general symmetry and non-pattern
-    if (symmetrize)
-      throw std::runtime_error(
-          "array format MatrixMarket file cannot be symmetrized.");
-    if (mtx_field == PATTERN)
-      throw std::runtime_error(
-          "array format MatrixMarket file can't have \"pattern\" field type.");
-  }
-  if (symmetrize) {
-    numEdges = 2 * nnz;
-  }
-  // numEdges is only an upper bound (diagonal entries may be removed)
-  std::vector<struct Edge<lno_t, scalar_t>> edges(numEdges);
-  size_type nE      = 0;
-  lno_t numDiagonal = 0;
-  for (size_type i = 0; i < nnz; ++i) {
-    getline(mmf, fline);
-    std::stringstream ss2(fline);
-    struct Edge<lno_t, scalar_t> tmp;
-    // read source, dest (edge) and weight (value)
-    lno_t s, d;
-    scalar_t w;
-    if (mtx_format == ARRAY) {
-      // In array format, entries are listed in column major order,
-      // so the row and column can be determined just from the index i
-      //(but make them 1-based indices, to match the way coordinate works)
-      s = i % nr + 1;  // row
-      d = i / nr + 1;  // col
-    } else {
-      // In coordinate format, row and col of each entry is read from file
-      ss2 >> s >> d;
-    }
-    if (mtx_field == PATTERN)
-      w = 1;
-    else
-      w = readScalar<scalar_t>(ss2);
-    if (!transpose) {
-      tmp.src = s - 1;
-      tmp.dst = d - 1;
-      tmp.ew  = w;
-    } else {
-      tmp.src = d - 1;
-      tmp.dst = s - 1;
-      tmp.ew  = w;
-    }
-    if (tmp.src == tmp.dst) {
-      numDiagonal++;
-      if (!remove_diagonal) {
-        edges[nE++] = tmp;
-      }
-      continue;
-    }
-    edges[nE++] = tmp;
-    if (symmetrize) {
-      struct Edge<lno_t, scalar_t> tmp2;
-      tmp2.src = tmp.dst;
-      tmp2.dst = tmp.src;
-      // the symmetrized value is w, -w or conj(w) if mtx_sym is
-      // SYMMETRIC, SKEW_SYMMETRIC or HERMITIAN, respectively.
-      tmp2.ew     = symmetryFlip<scalar_t>(tmp.ew, mtx_sym);
-      edges[nE++] = tmp2;
-    }
-  }
-  mmf.close();
-  std::sort(edges.begin(), edges.begin() + nE);
-  if (transpose) {
-    lno_t tmp = nr;
-    nr        = nc;
-    nc        = tmp;
-  }
-  // idx *nv, idx *ne, idx **xadj, idx **adj, wt **wt
-  *nrows = nr;
-  *ncols = nc;
-  *ne    = nE;
-  //*xadj = new idx[nr + 1];
-  md_malloc<size_type>(xadj, nr + 1);
-  //*adj = new idx[nE];
-  md_malloc<lno_t>(adj, nE);
-  //*ew = new wt[nE];
-  md_malloc<scalar_t>(ew, nE);
-  size_type eind   = 0;
-  size_type actual = 0;
-  for (lno_t i = 0; i < nr; ++i) {
-    (*xadj)[i]    = actual;
-    bool is_first = true;
-    while (eind < nE && edges[eind].src == i) {
-      if (is_first || !symmetrize || eind == 0 ||
-          (eind > 0 && edges[eind - 1].dst != edges[eind].dst)) {
-        (*adj)[actual] = edges[eind].dst;
-        (*ew)[actual]  = edges[eind].ew;
-        ++actual;
-      }
-      is_first = false;
-      ++eind;
-    }
-  }
-  (*xadj)[nr] = actual;
-  *ne         = actual;
-  return 0;
-}
-
-// Version of read_mtx which does not capture the number of columns.
-// This is the old interface; it's kept for backwards compatibility.
-template <typename lno_t, typename size_type, typename scalar_t>
-int read_mtx(const char *fileName, lno_t *nv, size_type *ne, size_type **xadj,
-             lno_t **adj, scalar_t **ew, bool symmetrize = false,
-             bool remove_diagonal = true, bool transpose = false) {
-  lno_t ncol;  // will discard
-  return read_mtx<lno_t, size_type, scalar_t>(fileName, nv, &ncol, ne, xadj,
-                                              adj, ew, symmetrize,
-                                              remove_diagonal, transpose);
-}
-
-template <typename lno_t, typename size_type, typename scalar_t>
-void read_matrix(lno_t *nv, size_type *ne, size_type **xadj, lno_t **adj,
-                 scalar_t **ew, const char *filename) {
-  std::string strfilename(filename);
-  if (endswith(strfilename, ".mtx") || endswith(strfilename, ".mm")) {
-    read_mtx(filename, nv, ne, xadj, adj, ew, false, false, false);
-  }
-
-  else if (endswith(strfilename, ".bin")) {
-    read_graph_bin(nv, ne, xadj, adj, ew, filename);
-  }
-
-  else if (endswith(strfilename, ".crs")) {
-    read_graph_crs(nv, ne, xadj, adj, ew, filename);
-  }
-
-  else {
-    throw std::runtime_error("Reader is not available\n");
-  }
-}
-
-template <typename crsMat_t>
-crsMat_t read_kokkos_crst_matrix(const char *filename_) {
-  std::string strfilename(filename_);
-  bool isMatrixMarket =
-      endswith(strfilename, ".mtx") || endswith(strfilename, ".mm");
-
-  typedef typename crsMat_t::StaticCrsGraphType graph_t;
-  typedef typename graph_t::row_map_type::non_const_type row_map_view_t;
-  typedef typename graph_t::entries_type::non_const_type cols_view_t;
-  typedef typename crsMat_t::values_type::non_const_type values_view_t;
-
-  typedef typename row_map_view_t::value_type size_type;
-  typedef typename cols_view_t::value_type lno_t;
-  typedef typename values_view_t::value_type scalar_t;
-
-  lno_t nr, nc, *adj;
-  size_type *xadj, nnzA;
-  scalar_t *values;
-
-  if (isMatrixMarket) {
-    // MatrixMarket file contains the exact number of columns
-    read_mtx<lno_t, size_type, scalar_t>(filename_, &nr, &nc, &nnzA, &xadj,
-                                         &adj, &values, false, false, false);
-  } else {
-    //.crs and .bin files don't contain #cols, so will compute it later based on
-    // the entries
-    read_matrix<lno_t, size_type, scalar_t>(&nr, &nnzA, &xadj, &adj, &values,
-                                            filename_);
-  }
-
-  row_map_view_t rowmap_view("rowmap_view", nr + 1);
-  cols_view_t columns_view("colsmap_view", nnzA);
-  values_view_t values_view("values_view", nnzA);
-
-  {
-    Kokkos::View<size_type *, Kokkos::HostSpace,
-                 Kokkos::MemoryTraits<Kokkos::Unmanaged>>
-        hr(xadj, nr + 1);
-    Kokkos::View<lno_t *, Kokkos::HostSpace,
-                 Kokkos::MemoryTraits<Kokkos::Unmanaged>>
-        hc(adj, nnzA);
-    Kokkos::View<scalar_t *, Kokkos::HostSpace,
-                 Kokkos::MemoryTraits<Kokkos::Unmanaged>>
-        hv(values, nnzA);
-    Kokkos::deep_copy(rowmap_view, hr);
-    Kokkos::deep_copy(columns_view, hc);
-    Kokkos::deep_copy(values_view, hv);
-  }
-
-  if (!isMatrixMarket) {
-    KokkosKernels::Impl::kk_view_reduce_max<cols_view_t,
-                                            typename crsMat_t::execution_space>(
-        nnzA, columns_view, nc);
-    nc++;
-  }
-
-  graph_t static_graph(columns_view, rowmap_view);
-  crsMat_t crsmat("CrsMatrix", nc, values_view, static_graph);
-  delete[] xadj;
-  delete[] adj;
-  delete[] values;
-  return crsmat;
-}
-
-template <typename crsGraph_t>
-crsGraph_t read_kokkos_crst_graph(const char *filename_) {
-  typedef typename crsGraph_t::row_map_type::non_const_type row_map_view_t;
-  typedef typename crsGraph_t::entries_type::non_const_type cols_view_t;
-
-  typedef typename row_map_view_t::value_type size_type;
-  typedef typename cols_view_t::value_type lno_t;
-  typedef double scalar_t;
-
-  lno_t nv, *adj;
-  size_type *xadj, nnzA;
-  scalar_t *values;
-  read_matrix<lno_t, size_type, scalar_t>(&nv, &nnzA, &xadj, &adj, &values,
-                                          filename_);
-
-  row_map_view_t rowmap_view("rowmap_view", nv + 1);
-  cols_view_t columns_view("colsmap_view", nnzA);
-
-  {
-    typename row_map_view_t::HostMirror hr =
-        Kokkos::create_mirror_view(rowmap_view);
-    typename cols_view_t::HostMirror hc =
-        Kokkos::create_mirror_view(columns_view);
-
-    for (lno_t i = 0; i <= nv; ++i) {
-      hr(i) = xadj[i];
-    }
-
-    for (size_type i = 0; i < nnzA; ++i) {
-      hc(i) = adj[i];
-    }
-    Kokkos::deep_copy(rowmap_view, hr);
-    Kokkos::deep_copy(columns_view, hc);
-  }
-
-  lno_t ncols = 0;
-  KokkosKernels::Impl::kk_view_reduce_max<cols_view_t,
-                                          typename crsGraph_t::execution_space>(
-      nnzA, columns_view, ncols);
-  ncols += 1;
-
-  crsGraph_t static_graph(columns_view, rowmap_view, ncols);
-  delete[] xadj;
-  delete[] adj;
-  delete[] values;
-  return static_graph;
-}
-
-template <typename size_type, typename nnz_lno_t>
-inline void kk_sequential_create_incidence_matrix(
-    nnz_lno_t num_rows, const size_type *xadj, const nnz_lno_t *adj,
-    size_type *i_adj  // output. preallocated
-) {
-  std::vector<size_type> c_xadj(num_rows);
-  for (nnz_lno_t i = 0; i < num_rows; i++) {
-    c_xadj[i] = xadj[i];
-  }
-  int eCnt = 0;
-  for (nnz_lno_t i = 0; i < num_rows; i++) {
-    size_type begin   = xadj[i];
-    size_type end     = xadj[i + 1];
-    nnz_lno_t adjsize = end - begin;
-
-    for (nnz_lno_t j = 0; j < adjsize; j++) {
-      size_type aind = j + begin;
-      nnz_lno_t col  = adj[aind];
-      if (i < col) {
-        i_adj[c_xadj[i]++]   = eCnt;
-        i_adj[c_xadj[col]++] = eCnt++;
-      }
-    }
-  }
-
-  for (nnz_lno_t i = 0; i < num_rows; i++) {
-    if (c_xadj[i] != xadj[i + 1]) {
-      std::cout << "i:" << i << " c_xadj[i]:" << c_xadj[i]
-                << " xadj[i+1]:" << xadj[i + 1] << std::endl;
-    }
-  }
-}
-
-template <typename size_type, typename nnz_lno_t>
-inline void kk_sequential_create_incidence_matrix_transpose(
-    const nnz_lno_t num_rows, const size_type num_edges, const size_type *xadj,
-    const nnz_lno_t *adj,
-    size_type *i_xadj,  // output. preallocated
-    nnz_lno_t *i_adj    // output. preallocated
-) {
-  for (nnz_lno_t i = 0; i < num_edges / 2 + 1; i++) {
-    i_xadj[i] = i * 2;
-  }
-  int eCnt = 0;
-  for (nnz_lno_t i = 0; i < num_rows; i++) {
-    size_type begin   = xadj[i];
-    size_type end     = xadj[i + 1];
-    nnz_lno_t adjsize = end - begin;
-
-    for (nnz_lno_t j = 0; j < adjsize; j++) {
-      size_type aind = j + begin;
-      nnz_lno_t col  = adj[aind];
-      if (i < col) {
-        i_adj[eCnt++] = i;
-        i_adj[eCnt++] = col;
-      }
-    }
-  }
-}
-
 }  // namespace Impl
 }  // namespace KokkosKernels
 
diff --git a/src/common/KokkosKernels_SimpleUtils.hpp b/src/common/KokkosKernels_SimpleUtils.hpp
index c1f68ebd3b..bb2a6d43b9 100644
--- a/src/common/KokkosKernels_SimpleUtils.hpp
+++ b/src/common/KokkosKernels_SimpleUtils.hpp
@@ -346,7 +346,7 @@ struct ReduceMaxFunctor {
     }
   }
   KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type &dst, const volatile value_type &src) const {
+  void join(value_type &dst, const value_type &src) const {
     if (dst < src) {
       dst = src;
     }
diff --git a/src/common/KokkosKernels_Sorting.hpp b/src/common/KokkosKernels_Sorting.hpp
index 1cdf1df7ee..8b897047d9 100644
--- a/src/common/KokkosKernels_Sorting.hpp
+++ b/src/common/KokkosKernels_Sorting.hpp
@@ -61,41 +61,6 @@ struct DefaultComparator {
 };
 }  // namespace Impl
 
-// ----------------------------------
-// CRS matrix/graph sorting utilities
-// ----------------------------------
-
-// The sort_crs* functions sort the adjacent column list for each row into
-// ascending order.
-
-template <typename execution_space, typename rowmap_t, typename entries_t,
-          typename values_t>
-void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries,
-                     const values_t& values);
-
-template <typename crsMat_t>
-void sort_crs_matrix(const crsMat_t& A);
-
-template <typename execution_space, typename rowmap_t, typename entries_t>
-void sort_crs_graph(const rowmap_t& rowmap, const entries_t& entries);
-
-template <typename crsGraph_t>
-void sort_crs_graph(const crsGraph_t& G);
-
-// sort_and_merge_matrix produces a new matrix which is equivalent to A but is
-// sorted and has no duplicated entries: each (i, j) is unique. Values for
-// duplicated entries are summed.
-template <typename crsMat_t>
-crsMat_t sort_and_merge_matrix(const crsMat_t& A);
-
-template <typename crsGraph_t>
-crsGraph_t sort_and_merge_graph(const crsGraph_t& G);
-
-template <typename exec_space, typename rowmap_t, typename entries_t>
-void sort_and_merge_graph(const typename rowmap_t::const_type& rowmap_in,
-                          const entries_t& entries_in, rowmap_t& rowmap_out,
-                          entries_t& entries_out);
-
 // ----------------------------
 // General device-level sorting
 // ----------------------------
@@ -148,240 +113,6 @@ KOKKOS_INLINE_FUNCTION void TeamBitonicSort2(
 
 namespace Impl {
 
-template <typename execution_space, typename rowmap_t, typename entries_t,
-          typename values_t>
-struct SortCrsMatrixFunctor {
-  using size_type = typename rowmap_t::non_const_value_type;
-  using lno_t     = typename entries_t::non_const_value_type;
-  using scalar_t  = typename values_t::non_const_value_type;
-  using team_mem  = typename Kokkos::TeamPolicy<execution_space>::member_type;
-  // The functor owns memory for entriesAux, so it can't have
-  // MemoryTraits<Unmanaged>
-  using entries_managed_t = Kokkos::View<typename entries_t::data_type,
-                                         typename entries_t::device_type>;
-  using values_managed_t  = Kokkos::View<typename values_t::data_type,
-                                        typename values_t::device_type>;
-
-  SortCrsMatrixFunctor(bool usingRangePol, const rowmap_t& rowmap_,
-                       const entries_t& entries_, const values_t& values_)
-      : rowmap(rowmap_), entries(entries_), values(values_) {
-    if (usingRangePol) {
-      entriesAux = entries_managed_t(
-          Kokkos::view_alloc(Kokkos::WithoutInitializing, "Entries aux"),
-          entries.extent(0));
-      valuesAux = values_managed_t(
-          Kokkos::view_alloc(Kokkos::WithoutInitializing, "Values aux"),
-          values.extent(0));
-    }
-    // otherwise, aux arrays won't be allocated (sorting in place)
-  }
-
-  KOKKOS_INLINE_FUNCTION void operator()(const lno_t i) const {
-    size_type rowStart = rowmap(i);
-    size_type rowEnd   = rowmap(i + 1);
-    lno_t rowNum       = rowEnd - rowStart;
-    // Radix sort requires unsigned keys for comparison
-    using unsigned_lno_t = typename std::make_unsigned<lno_t>::type;
-    KokkosKernels::SerialRadixSort2<lno_t, unsigned_lno_t, scalar_t>(
-        (unsigned_lno_t*)entries.data() + rowStart,
-        (unsigned_lno_t*)entriesAux.data() + rowStart, values.data() + rowStart,
-        valuesAux.data() + rowStart, rowNum);
-  }
-
-  KOKKOS_INLINE_FUNCTION void operator()(const team_mem t) const {
-    size_type i        = t.league_rank();
-    size_type rowStart = rowmap(i);
-    size_type rowEnd   = rowmap(i + 1);
-    lno_t rowNum       = rowEnd - rowStart;
-    KokkosKernels::TeamBitonicSort2<lno_t, lno_t, scalar_t, team_mem>(
-        entries.data() + rowStart, values.data() + rowStart, rowNum, t);
-  }
-
-  rowmap_t rowmap;
-  entries_t entries;
-  entries_managed_t entriesAux;
-  values_t values;
-  values_managed_t valuesAux;
-};
-
-template <typename execution_space, typename rowmap_t, typename entries_t>
-struct SortCrsGraphFunctor {
-  using size_type = typename rowmap_t::non_const_value_type;
-  using lno_t     = typename entries_t::non_const_value_type;
-  using team_mem  = typename Kokkos::TeamPolicy<execution_space>::member_type;
-  // The functor owns memory for entriesAux, so it can't have
-  // MemoryTraits<Unmanaged>
-  using entries_managed_t = Kokkos::View<typename entries_t::data_type,
-                                         typename entries_t::device_type>;
-
-  SortCrsGraphFunctor(bool usingRangePol, const rowmap_t& rowmap_,
-                      const entries_t& entries_)
-      : rowmap(rowmap_), entries(entries_) {
-    if (usingRangePol) {
-      entriesAux = entries_managed_t(
-          Kokkos::view_alloc(Kokkos::WithoutInitializing, "Entries aux"),
-          entries.extent(0));
-    }
-    // otherwise, aux arrays won't be allocated (sorting in place)
-  }
-
-  KOKKOS_INLINE_FUNCTION void operator()(const lno_t i) const {
-    size_type rowStart = rowmap(i);
-    size_type rowEnd   = rowmap(i + 1);
-    lno_t rowNum       = rowEnd - rowStart;
-    // Radix sort requires unsigned keys for comparison
-    using unsigned_lno_t = typename std::make_unsigned<lno_t>::type;
-    KokkosKernels::SerialRadixSort<lno_t, unsigned_lno_t>(
-        (unsigned_lno_t*)entries.data() + rowStart,
-        (unsigned_lno_t*)entriesAux.data() + rowStart, rowNum);
-  }
-
-  KOKKOS_INLINE_FUNCTION void operator()(const team_mem t) const {
-    size_type i        = t.league_rank();
-    size_type rowStart = rowmap(i);
-    size_type rowEnd   = rowmap(i + 1);
-    lno_t rowNum       = rowEnd - rowStart;
-    KokkosKernels::TeamBitonicSort<lno_t, lno_t, team_mem>(
-        entries.data() + rowStart, rowNum, t);
-  }
-
-  rowmap_t rowmap;
-  entries_t entries;
-  entries_managed_t entriesAux;
-};
-
-template <typename rowmap_t, typename entries_t>
-struct MergedRowmapFunctor {
-  using size_type  = typename rowmap_t::non_const_value_type;
-  using lno_t      = typename entries_t::non_const_value_type;
-  using c_rowmap_t = typename rowmap_t::const_type;
-
-  // Precondition: entries are sorted within each row
-  MergedRowmapFunctor(const rowmap_t& mergedCounts_, const c_rowmap_t& rowmap_,
-                      const entries_t& entries_)
-      : mergedCounts(mergedCounts_), rowmap(rowmap_), entries(entries_) {}
-
-  KOKKOS_INLINE_FUNCTION void operator()(lno_t row, size_type& lnewNNZ) const {
-    size_type rowBegin = rowmap(row);
-    size_type rowEnd   = rowmap(row + 1);
-    if (rowEnd == rowBegin) {
-      // Row was empty to begin with
-      mergedCounts(row) = 0;
-      return;
-    }
-    // Otherwise, the first entry in the row exists
-    lno_t uniqueEntries = 1;
-    for (size_type j = rowBegin + 1; j < rowEnd; j++) {
-      if (entries(j - 1) != entries(j)) uniqueEntries++;
-    }
-    mergedCounts(row) = uniqueEntries;
-    lnewNNZ += uniqueEntries;
-    if (row == lno_t((rowmap.extent(0) - 1) - 1)) mergedCounts(row + 1) = 0;
-  }
-
-  rowmap_t mergedCounts;
-  c_rowmap_t rowmap;
-  entries_t entries;
-};
-
-template <typename rowmap_t, typename entries_t, typename values_t>
-struct MatrixMergedEntriesFunctor {
-  using size_type = typename rowmap_t::non_const_value_type;
-  using lno_t     = typename entries_t::non_const_value_type;
-  using scalar_t  = typename values_t::non_const_value_type;
-
-  // Precondition: entries are sorted within each row
-  MatrixMergedEntriesFunctor(const rowmap_t& rowmap_, const entries_t& entries_,
-                             const values_t& values_,
-                             const rowmap_t& mergedRowmap_,
-                             const entries_t& mergedEntries_,
-                             const values_t& mergedValues_)
-      : rowmap(rowmap_),
-        entries(entries_),
-        values(values_),
-        mergedRowmap(mergedRowmap_),
-        mergedEntries(mergedEntries_),
-        mergedValues(mergedValues_) {}
-
-  KOKKOS_INLINE_FUNCTION void operator()(lno_t row) const {
-    size_type rowBegin = rowmap(row);
-    size_type rowEnd   = rowmap(row + 1);
-    if (rowEnd == rowBegin) {
-      // Row was empty to begin with, nothing to do
-      return;
-    }
-    // Otherwise, accumulate the value for each column
-    scalar_t accumVal   = values(rowBegin);
-    lno_t accumCol      = entries(rowBegin);
-    size_type insertPos = mergedRowmap(row);
-    for (size_type j = rowBegin + 1; j < rowEnd; j++) {
-      if (accumCol == entries(j)) {
-        // accumulate
-        accumVal += values(j);
-      } else {
-        // write out and reset
-        mergedValues(insertPos)  = accumVal;
-        mergedEntries(insertPos) = accumCol;
-        insertPos++;
-        accumVal = values(j);
-        accumCol = entries(j);
-      }
-    }
-    // always left with the last unique entry
-    mergedValues(insertPos)  = accumVal;
-    mergedEntries(insertPos) = accumCol;
-  }
-
-  rowmap_t rowmap;
-  entries_t entries;
-  values_t values;
-  rowmap_t mergedRowmap;
-  entries_t mergedEntries;
-  values_t mergedValues;
-};
-
-template <typename rowmap_t, typename entries_t>
-struct GraphMergedEntriesFunctor {
-  using size_type = typename rowmap_t::non_const_value_type;
-  using lno_t     = typename entries_t::non_const_value_type;
-
-  // Precondition: entries are sorted within each row
-  GraphMergedEntriesFunctor(const rowmap_t& rowmap_, const entries_t& entries_,
-                            const rowmap_t& mergedRowmap_,
-                            const entries_t& mergedEntries_)
-      : rowmap(rowmap_),
-        entries(entries_),
-        mergedRowmap(mergedRowmap_),
-        mergedEntries(mergedEntries_) {}
-
-  KOKKOS_INLINE_FUNCTION void operator()(lno_t row) const {
-    size_type rowBegin = rowmap(row);
-    size_type rowEnd   = rowmap(row + 1);
-    if (rowEnd == rowBegin) {
-      // Row was empty to begin with, nothing to do
-      return;
-    }
-    // Otherwise, accumulate the value for each column
-    lno_t accumCol      = entries(rowBegin);
-    size_type insertPos = mergedRowmap(row);
-    for (size_type j = rowBegin + 1; j < rowEnd; j++) {
-      if (accumCol != entries(j)) {
-        // write out and reset
-        mergedEntries(insertPos) = accumCol;
-        insertPos++;
-        accumCol = entries(j);
-      }
-    }
-    // always left with the last unique entry
-    mergedEntries(insertPos) = accumCol;
-  }
-
-  rowmap_t rowmap;
-  entries_t entries;
-  rowmap_t mergedRowmap;
-  entries_t mergedEntries;
-};
-
 // Functor that sorts a view on one team
 template <typename View, typename Ordinal, typename TeamMember,
           typename Comparator>
@@ -517,188 +248,6 @@ struct BitonicPhase2Functor {
 
 }  // namespace Impl
 
-// Sort a CRS matrix: within each row, sort entries ascending by column.
-// At the same time, permute the values.
-template <typename execution_space, typename rowmap_t, typename entries_t,
-          typename values_t>
-void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries,
-                     const values_t& values) {
-  using lno_t    = typename entries_t::non_const_value_type;
-  using team_pol = Kokkos::TeamPolicy<execution_space>;
-  bool useRadix  = !Impl::kk_is_gpu_exec_space<execution_space>();
-  lno_t numRows  = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0;
-  if (numRows == 0) return;
-  Impl::SortCrsMatrixFunctor<execution_space, rowmap_t, entries_t, values_t>
-      funct(useRadix, rowmap, entries, values);
-  if (useRadix) {
-    Kokkos::parallel_for("sort_crs_matrix",
-                         Kokkos::RangePolicy<execution_space>(0, numRows),
-                         funct);
-  } else {
-    // Try to get teamsize to be largest power of 2 not greater than avg entries
-    // per row
-    // TODO (probably important for performnce): add thread-level sort also, and
-    // use that for small avg degree. But this works for now.
-    lno_t idealTeamSize = 1;
-    lno_t avgDeg        = (entries.extent(0) + numRows - 1) / numRows;
-    while (idealTeamSize < avgDeg / 2) {
-      idealTeamSize *= 2;
-    }
-    team_pol temp(numRows, 1);
-    lno_t maxTeamSize = temp.team_size_max(funct, Kokkos::ParallelForTag());
-    lno_t teamSize    = std::min(idealTeamSize, maxTeamSize);
-    Kokkos::parallel_for("sort_crs_matrix", team_pol(numRows, teamSize), funct);
-  }
-}
-
-template <typename crsMat_t>
-void sort_crs_matrix(const crsMat_t& A) {
-  // Note: rowmap_t has const values, but that's OK as sorting doesn't modify it
-  using rowmap_t   = typename crsMat_t::row_map_type;
-  using entries_t  = typename crsMat_t::index_type::non_const_type;
-  using values_t   = typename crsMat_t::values_type::non_const_type;
-  using exec_space = typename crsMat_t::execution_space;
-  // NOTE: the rowmap of a StaticCrsGraph is const-valued, but the
-  // entries and CrsMatrix values are non-const (so sorting them directly
-  // is allowed)
-  sort_crs_matrix<exec_space, rowmap_t, entries_t, values_t>(
-      A.graph.row_map, A.graph.entries, A.values);
-}
-
-// Sort a CRS graph: within each row, sort entries ascending by column.
-template <typename execution_space, typename rowmap_t, typename entries_t>
-void sort_crs_graph(const rowmap_t& rowmap, const entries_t& entries) {
-  using lno_t    = typename entries_t::non_const_value_type;
-  using team_pol = Kokkos::TeamPolicy<execution_space>;
-  bool useRadix  = !Impl::kk_is_gpu_exec_space<execution_space>();
-  lno_t numRows  = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0;
-  if (numRows == 0) return;
-  Impl::SortCrsGraphFunctor<execution_space, rowmap_t, entries_t> funct(
-      useRadix, rowmap, entries);
-  if (useRadix) {
-    Kokkos::parallel_for("sort_crs_graph",
-                         Kokkos::RangePolicy<execution_space>(0, numRows),
-                         funct);
-  } else {
-    // Try to get teamsize to be largest power of 2 less than or equal to
-    // half the entries per row. 0.5 * #entries is bitonic's parallelism within
-    // a row.
-    // TODO (probably important for performnce): add thread-level sort also, and
-    // use that for small avg degree. But this works for now.
-    lno_t idealTeamSize = 1;
-    lno_t avgDeg        = (entries.extent(0) + numRows - 1) / numRows;
-    while (idealTeamSize < avgDeg / 2) {
-      idealTeamSize *= 2;
-    }
-    team_pol temp(numRows, 1);
-    lno_t maxTeamSize = temp.team_size_max(funct, Kokkos::ParallelForTag());
-    lno_t teamSize    = std::min(idealTeamSize, maxTeamSize);
-    Kokkos::parallel_for("sort_crs_graph", team_pol(numRows, teamSize), funct);
-  }
-}
-
-template <typename crsGraph_t>
-void sort_crs_graph(const crsGraph_t& G) {
-  static_assert(
-      !std::is_const<typename crsGraph_t::entries_type::value_type>::value,
-      "sort_crs_graph requires StaticCrsGraph entries to be non-const.");
-  sort_crs_graph<typename crsGraph_t::execution_space,
-                 typename crsGraph_t::row_map_type,
-                 typename crsGraph_t::entries_type>(G.row_map, G.entries);
-}
-
-// Sort the rows of matrix, and merge duplicate entries.
-template <typename crsMat_t>
-crsMat_t sort_and_merge_matrix(const crsMat_t& A) {
-  using c_rowmap_t = typename crsMat_t::row_map_type;
-  using rowmap_t   = typename crsMat_t::row_map_type::non_const_type;
-  using entries_t  = typename crsMat_t::index_type::non_const_type;
-  using values_t   = typename crsMat_t::values_type::non_const_type;
-  using size_type  = typename rowmap_t::non_const_value_type;
-  using exec_space = typename crsMat_t::execution_space;
-  using range_t    = Kokkos::RangePolicy<exec_space>;
-  sort_crs_matrix(A);
-  // Count entries per row into a new rowmap, in terms of merges that can be
-  // done
-  rowmap_t mergedRowmap(
-      Kokkos::view_alloc(Kokkos::WithoutInitializing, "SortedMerged rowmap"),
-      A.numRows() + 1);
-  size_type numCompressedEntries = 0;
-  Kokkos::parallel_reduce(range_t(0, A.numRows()),
-                          Impl::MergedRowmapFunctor<rowmap_t, entries_t>(
-                              mergedRowmap, A.graph.row_map, A.graph.entries),
-                          numCompressedEntries);
-  // Prefix sum to get rowmap
-  Impl::kk_exclusive_parallel_prefix_sum<rowmap_t, exec_space>(A.numRows() + 1,
-                                                               mergedRowmap);
-  entries_t mergedEntries("SortedMerged entries", numCompressedEntries);
-  values_t mergedValues("SortedMerged values", numCompressedEntries);
-  // Compute merged entries and values
-  Kokkos::parallel_for(
-      range_t(0, A.numRows()),
-      Impl::MatrixMergedEntriesFunctor<c_rowmap_t, entries_t, values_t>(
-          A.graph.row_map, A.graph.entries, A.values, mergedRowmap,
-          mergedEntries, mergedValues));
-  // Finally, construct the new compressed matrix
-  return crsMat_t("SortedMerged", A.numRows(), A.numCols(),
-                  numCompressedEntries, mergedValues, mergedRowmap,
-                  mergedEntries);
-}
-
-template <typename exec_space, typename rowmap_t, typename entries_t>
-void sort_and_merge_graph(const typename rowmap_t::const_type& rowmap_in,
-                          const entries_t& entries_in, rowmap_t& rowmap_out,
-                          entries_t& entries_out) {
-  using size_type      = typename rowmap_t::non_const_value_type;
-  using lno_t          = typename entries_t::non_const_value_type;
-  using range_t        = Kokkos::RangePolicy<exec_space>;
-  using const_rowmap_t = typename rowmap_t::const_type;
-  lno_t numRows        = rowmap_in.extent(0);
-  if (numRows <= 1) {
-    // Matrix has zero rows
-    rowmap_out  = rowmap_t();
-    entries_out = entries_t();
-    return;
-  }
-  numRows--;
-  // Sort in place
-  sort_crs_graph<exec_space, const_rowmap_t, entries_t>(rowmap_in, entries_in);
-  // Count entries per row into a new rowmap, in terms of merges that can be
-  // done
-  rowmap_out = rowmap_t(
-      Kokkos::view_alloc(Kokkos::WithoutInitializing, "SortedMerged rowmap"),
-      numRows + 1);
-  size_type numCompressedEntries = 0;
-  Kokkos::parallel_reduce(range_t(0, numRows),
-                          Impl::MergedRowmapFunctor<rowmap_t, entries_t>(
-                              rowmap_out, rowmap_in, entries_in),
-                          numCompressedEntries);
-  // Prefix sum to get rowmap
-  Impl::kk_exclusive_parallel_prefix_sum<rowmap_t, exec_space>(numRows + 1,
-                                                               rowmap_out);
-  entries_out = entries_t("SortedMerged entries", numCompressedEntries);
-  // Compute merged entries and values
-  Kokkos::parallel_for(
-      range_t(0, numRows),
-      Impl::GraphMergedEntriesFunctor<const_rowmap_t, entries_t>(
-          rowmap_in, entries_in, rowmap_out, entries_out));
-}
-
-template <typename crsGraph_t>
-crsGraph_t sort_and_merge_graph(const crsGraph_t& G) {
-  using rowmap_t  = typename crsGraph_t::row_map_type::non_const_type;
-  using entries_t = typename crsGraph_t::entries_type;
-  static_assert(
-      !std::is_const<typename entries_t::value_type>::value,
-      "sort_and_merge_graph requires StaticCrsGraph entries to be non-const.");
-  rowmap_t mergedRowmap;
-  entries_t mergedEntries;
-  sort_and_merge_graph<typename crsGraph_t::execution_space, rowmap_t,
-                       entries_t>(G.row_map, G.entries, mergedRowmap,
-                                  mergedEntries);
-  return crsGraph_t(mergedEntries, mergedRowmap);
-}
-
 // Version to be called from host on a single array
 // Generally ~2x slower than Kokkos::sort() for large arrays (> 50 M elements),
 // but faster for smaller arrays.
@@ -1032,39 +581,6 @@ KOKKOS_INLINE_FUNCTION void TeamBitonicSort2(ValueType* values, PermType* perm,
 // For backward compatibility: keep the public interface accessible in
 // KokkosKernels::Impl::
 namespace Impl {
-template <typename execution_space, typename rowmap_t, typename entries_t>
-[[deprecated]] void sort_crs_graph(const rowmap_t& rowmap,
-                                   const entries_t& entries) {
-  KokkosKernels::sort_crs_graph<execution_space, rowmap_t, entries_t>(rowmap,
-                                                                      entries);
-}
-
-template <typename execution_space, typename rowmap_t, typename entries_t,
-          typename values_t>
-[[deprecated]] void sort_crs_matrix(const rowmap_t& rowmap,
-                                    const entries_t& entries,
-                                    const values_t& values) {
-  KokkosKernels::sort_crs_matrix<execution_space, rowmap_t, entries_t,
-                                 values_t>(rowmap, entries, values);
-}
-
-template <typename crsMat_t>
-[[deprecated]] void sort_crs_matrix(const crsMat_t& A) {
-  KokkosKernels::sort_crs_matrix(A);
-}
-
-template <typename exec_space, typename rowmap_t, typename entries_t>
-[[deprecated]] void sort_and_merge_graph(
-    const typename rowmap_t::const_type& rowmap_in, const entries_t& entries_in,
-    rowmap_t& rowmap_out, entries_t& entries_out) {
-  KokkosKernels::sort_and_merge_graph<exec_space, rowmap_t, entries_t>(
-      rowmap_in, entries_in, rowmap_out, entries_out);
-}
-
-template <typename crsMat_t>
-[[deprecated]] crsMat_t sort_and_merge_matrix(const crsMat_t& A) {
-  return KokkosKernels::sort_and_merge_matrix(A);
-}
 
 template <
     typename View, typename ExecSpace, typename Ordinal,
diff --git a/src/common/KokkosKernels_Utils.hpp b/src/common/KokkosKernels_Utils.hpp
index 655d89ba67..eae4080879 100644
--- a/src/common/KokkosKernels_Utils.hpp
+++ b/src/common/KokkosKernels_Utils.hpp
@@ -49,7 +49,6 @@
 
 #include "KokkosKernels_ExecSpaceUtils.hpp"
 #include "KokkosKernels_SimpleUtils.hpp"
-#include "KokkosKernels_SparseUtils.hpp"
 #include "KokkosKernels_PrintUtils.hpp"
 #include "KokkosKernels_VectorUtils.hpp"
 
@@ -516,7 +515,7 @@ struct PropogataMaxValstoZeros {
   }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile idx &update, volatile const idx &input) const {
+  void join(idx &update, const idx &input) const {
     if (input > update) update = input;
   }
 };
@@ -1261,7 +1260,7 @@ struct ReduceRowSizeFunctor {
     }
   }
   KOKKOS_INLINE_FUNCTION
-  void join(volatile size_type &dst, const volatile size_type &src) const {
+  void join(size_type &dst, const size_type &src) const {
     if (dst < src) {
       dst = src;
     }
@@ -1306,7 +1305,7 @@ struct ReduceMaxRowFunctor {
     }
   }
   KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type &dst, const volatile value_type &src) const {
+  void join(value_type &dst, const value_type &src) const {
     if (dst < src) {
       dst = src;
     }
@@ -1351,9 +1350,7 @@ struct IsEqualFunctor {
   }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile int &dst, const volatile int &src) const {
-    dst = dst & src;
-  }
+  void join(int &dst, const int &src) const { dst = dst & src; }
   KOKKOS_INLINE_FUNCTION
   void init(int &dst) const { dst = 1; }
 };
@@ -1467,11 +1464,6 @@ struct array_sum_reduce {
     for (int i = 0; i < N; i++) data[i] += src.data[i];
     return *this;
   }
-  KOKKOS_INLINE_FUNCTION  // volatile add operator
-      void
-      operator+=(const volatile ValueType &src) volatile {
-    for (int i = 0; i < N; i++) data[i] += src.data[i];
-  }
 };
 
 template <typename InPtr, typename T>
diff --git a/src/common/KokkosKernels_default_types.hpp b/src/common/KokkosKernels_default_types.hpp
index 4012b2e158..d70a6b27ac 100644
--- a/src/common/KokkosKernels_default_types.hpp
+++ b/src/common/KokkosKernels_default_types.hpp
@@ -79,6 +79,8 @@ using default_scalar = double;
 using default_scalar    = float;
 #elif defined(KOKKOSKERNELS_INST_HALF)
 using default_scalar    = Kokkos::Experimental::half_t;
+#elif defined(KOKKOSKERNELS_INST_BHALF)
+using default_scalar = Kokkos::Experimental::bhalf_t;
 #else
 using default_scalar = double;
 #endif
diff --git a/src/common/Kokkos_ArithTraits.hpp b/src/common/Kokkos_ArithTraits.hpp
new file mode 100644
index 0000000000..108e845694
--- /dev/null
+++ b/src/common/Kokkos_ArithTraits.hpp
@@ -0,0 +1,2083 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_ARITHTRAITS_HPP
+#define KOKKOS_ARITHTRAITS_HPP
+
+/// \file Kokkos_ArithTraits.hpp
+/// \brief Declaration and definition of Kokkos::Details::ArithTraits
+
+#include <KokkosKernels_config.h>
+#include <Kokkos_NumericTraits.hpp>
+#include <Kokkos_MathematicalFunctions.hpp>
+#include <Kokkos_Complex.hpp>
+#include <Kokkos_Macros.hpp>
+#include <KokkosKernels_Half.hpp>
+
+#include <impl/Kokkos_QuadPrecisionMath.hpp>
+
+#include <cfloat>
+#include <climits>
+#include <cmath>
+#include <complex>  // std::complex
+#include <limits>   // std::numeric_limits
+#ifdef __CUDACC__
+#include <math_constants.h>
+#endif
+
+namespace {  // anonymous
+
+/// \fn intPowImpl
+/// \tparam IntType A built-in integer type.
+/// \brief Implementation of intPowSigned and intPowUnsigned.
+///
+/// \pre x != 0
+/// \pre y > 0
+///
+/// Use intPowSigned or intPowUnsigned for general y.
+template <class IntType>
+KOKKOS_FORCEINLINE_FUNCTION IntType intPowImpl(const IntType x,
+                                               const IntType y) {
+  // Recursion (unrolled into while loop): pow(x, 2y) = (x^y)^2
+  IntType prod  = x;
+  IntType y_cur = 1;
+  // If y == 1, then prod stays x.
+  while (y_cur < y) {
+    prod  = prod * prod;
+    y_cur = y_cur << 1;
+  }
+  // abs(y - y_cur) < floor(log2(y)), so it won't hurt asymptotic run
+  // time to finish the remainder in a linear iteration.
+  if (y > y_cur) {
+    const IntType left = y - y_cur;
+    for (IntType k = 0; k < left; ++k) {
+      prod = prod * x;
+    }
+  } else if (y < y_cur) {
+    // There's probably a better way to do this in order to avoid the
+    // (expensive) integer division, but I'm not motivated to think of
+    // it at the moment.
+    const IntType left = y_cur - y;
+    for (IntType k = 0; k < left; ++k) {
+      prod = prod / x;
+    }
+  }
+  return prod;
+
+  // y = 8:
+  //
+  // x,1   -> x^2,2
+  // x^2,2 -> x^4,4
+  // x^4,4 -> x^8,8
+  //
+  // y = 9:
+  //
+  // x,1   -> x^2,2
+  // x^2,2 -> x^4,4
+  // x^4,4 -> x^8,8
+  //
+  // y - y_cur is what's left over.  Just do it one at a time.
+  //
+  // y = 3:
+  // x,1   -> x^2,2
+  // x^2,2 -> x^4,4
+}
+
+// Warning free abs function for types where we don't know whether they are
+// signed (like char)
+template <class T, bool is_signed = std::numeric_limits<T>::is_signed>
+struct integer_abs {
+  static KOKKOS_INLINE_FUNCTION T abs(const T& val);
+};
+
+template <class T>
+struct integer_abs<T, true> {
+  static KOKKOS_INLINE_FUNCTION T abs(const T& x) { return x < 0 ? -x : x; }
+};
+
+template <class T>
+struct integer_abs<T, false> {
+  static KOKKOS_INLINE_FUNCTION T abs(const T& x) { return x; }
+};
+
+/// \fn intPowSigned
+/// \tparam IntType A built-in signed integer type.
+/// \brief Compute x raised to the power y.
+///
+/// If the arguments are invalid (e.g., if x and y are both zero), the
+/// result of this function is undefined.  However, this function will
+/// not throw an exception in that case.
+template <class IntType>
+KOKKOS_FORCEINLINE_FUNCTION
+    typename std::enable_if<std::numeric_limits<IntType>::is_signed,
+                            IntType>::type
+    intPowSigned(const IntType x, const IntType y) {
+  // It's not entirely clear what to return if x and y are both zero.
+  // In the case of floating-point numbers, 0^0 is NaN.  Here, though,
+  // I think it's safe to return 0.
+  if (x == 0) {
+    return 0;
+  } else if (y == 0) {
+    return 1;
+  } else if (y < 0) {
+    if (x == 1) {
+      return 1;
+    } else if (x == -1) {
+      return (y % 2 == 0) ? 1 : -1;
+    } else {
+      return 0;  // round the fraction to zero
+    }
+  }
+  return intPowImpl<IntType>(x, y);
+}
+template <class IntType>
+KOKKOS_FORCEINLINE_FUNCTION
+    typename std::enable_if<!std::numeric_limits<IntType>::is_signed,
+                            IntType>::type
+    intPowSigned(const IntType x, const IntType y) {
+  // It's not entirely clear what to return if x and y are both zero.
+  // In the case of floating-point numbers, 0^0 is NaN.  Here, though,
+  // I think it's safe to return 0.
+  if (x == 0) {
+    return 0;
+  } else if (y == 0) {
+    return 1;
+  }
+  return intPowImpl<IntType>(x, y);
+}
+
+/// \fn intPowUnsigned
+/// \tparam IntType A built-in unsigned integer type.
+/// \brief Compute x raised to the power y.
+///
+/// If the arguments are invalid (e.g., if x and y are both zero), the
+/// result of this function is undefined.  However, this function will
+/// not throw an exception in that case.
+template <class IntType>
+KOKKOS_FORCEINLINE_FUNCTION IntType intPowUnsigned(const IntType x,
+                                                   const IntType y) {
+  // It's not entirely clear what to return if x and y are both zero.
+  // In the case of floating-point numbers, 0^0 is NaN.  Here, though,
+  // I think it's safe to return 0.
+  if (x == 0) {
+    return 0;
+  } else if (y == 0) {
+    return 1;
+  } else {
+    return intPowImpl<IntType>(x, y);
+  }
+}
+
+// It might make sense to use special sqrt() approximations for
+// integer arguments, like those presented on the following web site:
+//
+// http://www.azillionmonkeys.com/qed/sqroot.html#implementations
+//
+// Note that some of the implementations on the above page break ANSI
+// C(++) aliasing rules (by assigning to the results of
+// reinterpret_cast-ing between int and float).  It's also just a
+// performance optimization and not required for a reasonable
+// implementation.
+
+}  // namespace
+
+namespace Kokkos {
+namespace Details {
+
+// Macro to automate the wrapping of Kokkos Mathematical Functions
+// in the ArithTraits struct for real floating point types, hopefully
+// this can be expanded to Kokkos::half_t and Kokkos::bhalf_t
+#define KOKKOSKERNELS_ARITHTRAITS_REAL_FP(FUNC_QUAL)                           \
+  static FUNC_QUAL val_type zero() { return static_cast<val_type>(0); }        \
+  static FUNC_QUAL val_type one() { return static_cast<val_type>(1); }         \
+  static FUNC_QUAL val_type min() {                                            \
+    return Kokkos::Experimental::finite_min<val_type>::value;                  \
+  }                                                                            \
+  static FUNC_QUAL val_type max() {                                            \
+    return Kokkos::Experimental::finite_max<val_type>::value;                  \
+  }                                                                            \
+  static FUNC_QUAL val_type infinity() {                                       \
+    return Kokkos::Experimental::infinity<val_type>::value;                    \
+  }                                                                            \
+  static FUNC_QUAL val_type nan() {                                            \
+    return Kokkos::Experimental::quiet_NaN<val_type>::value;                   \
+  }                                                                            \
+  static FUNC_QUAL mag_type epsilon() {                                        \
+    return Kokkos::Experimental::epsilon<val_type>::value;                     \
+  }                                                                            \
+  static FUNC_QUAL mag_type sfmin() {                                          \
+    return Kokkos::Experimental::norm_min<val_type>::value;                    \
+  }                                                                            \
+  static FUNC_QUAL int base() {                                                \
+    return Kokkos::Experimental::radix<val_type>::value;                       \
+  }                                                                            \
+  static FUNC_QUAL mag_type prec() {                                           \
+    return epsilon() * static_cast<mag_type>(base());                          \
+  }                                                                            \
+  static FUNC_QUAL int t() {                                                   \
+    return Kokkos::Experimental::digits<val_type>::value;                      \
+  }                                                                            \
+  static FUNC_QUAL mag_type rnd() { return one(); }                            \
+  static FUNC_QUAL int emin() {                                                \
+    return Kokkos::Experimental::min_exponent<val_type>::value;                \
+  }                                                                            \
+  static FUNC_QUAL mag_type rmin() {                                           \
+    return Kokkos::Experimental::norm_min<val_type>::value;                    \
+  }                                                                            \
+  static FUNC_QUAL int emax() {                                                \
+    return Kokkos::Experimental::max_exponent<val_type>::value;                \
+  }                                                                            \
+  static FUNC_QUAL mag_type rmax() {                                           \
+    return Kokkos::Experimental::finite_max<val_type>::value;                  \
+  }                                                                            \
+                                                                               \
+  static FUNC_QUAL bool isInf(const val_type x) { return Kokkos::isinf(x); }   \
+  static FUNC_QUAL bool isNan(const val_type x) { return Kokkos::isnan(x); }   \
+  static FUNC_QUAL mag_type abs(const val_type x) { return Kokkos::abs(x); }   \
+  static FUNC_QUAL mag_type real(const val_type x) { return Kokkos::real(x); } \
+  static FUNC_QUAL mag_type imag(const val_type x) { return Kokkos::imag(x); } \
+  static FUNC_QUAL val_type conj(const val_type x) { return x; }               \
+  static FUNC_QUAL val_type pow(const val_type x, const val_type y) {          \
+    return Kokkos::pow(x, y);                                                  \
+  }                                                                            \
+  static FUNC_QUAL val_type sqrt(const val_type x) { return Kokkos::sqrt(x); } \
+  static FUNC_QUAL val_type cbrt(const val_type x) { return Kokkos::cbrt(x); } \
+  static FUNC_QUAL val_type exp(const val_type x) { return Kokkos::exp(x); }   \
+  static FUNC_QUAL val_type log(const val_type x) { return Kokkos::log(x); }   \
+  static FUNC_QUAL val_type log10(const val_type x) {                          \
+    return Kokkos::log10(x);                                                   \
+  }                                                                            \
+  static FUNC_QUAL val_type sin(const val_type x) { return Kokkos::sin(x); }   \
+  static FUNC_QUAL val_type cos(const val_type x) { return Kokkos::cos(x); }   \
+  static FUNC_QUAL val_type tan(const val_type x) { return Kokkos::tan(x); }   \
+  static FUNC_QUAL val_type sinh(const val_type x) { return Kokkos::sinh(x); } \
+  static FUNC_QUAL val_type cosh(const val_type x) { return Kokkos::cosh(x); } \
+  static FUNC_QUAL val_type tanh(const val_type x) { return Kokkos::tanh(x); } \
+  static FUNC_QUAL val_type asin(const val_type x) { return Kokkos::asin(x); } \
+  static FUNC_QUAL val_type acos(const val_type x) { return Kokkos::acos(x); } \
+  static FUNC_QUAL val_type atan(const val_type x) { return Kokkos::atan(x); } \
+                                                                               \
+  static FUNC_QUAL bool isnaninf(const val_type x) {                           \
+    return isNan(x) || isInf(x);                                               \
+  }                                                                            \
+  static FUNC_QUAL magnitudeType magnitude(const val_type x) {                 \
+    return abs(x);                                                             \
+  }                                                                            \
+  static FUNC_QUAL val_type conjugate(const val_type x) { return conj(x); }    \
+  static FUNC_QUAL val_type squareroot(const val_type x) { return sqrt(x); }   \
+  static FUNC_QUAL mag_type eps() { return epsilon(); }
+
+#define KOKKOSKERNELS_ARITHTRAITS_CMPLX_FP(FUNC_QUAL)                          \
+                                                                               \
+  static constexpr bool is_specialized = true;                                 \
+  static constexpr bool is_signed      = true;                                 \
+  static constexpr bool is_integer     = false;                                \
+  static constexpr bool is_exact       = false;                                \
+  static constexpr bool is_complex     = true;                                 \
+  static constexpr bool has_infinity   = true;                                 \
+                                                                               \
+  using magnitudeType = mag_type;                                              \
+  using halfPrecision =                                                        \
+      ::Kokkos::complex<ArithTraits<mag_type>::halfPrecision>;                 \
+  using doublePrecision =                                                      \
+      ::Kokkos::complex<ArithTraits<mag_type>::doublePrecision>;               \
+                                                                               \
+  static constexpr bool isComplex    = true;                                   \
+  static constexpr bool isOrdinal    = false;                                  \
+  static constexpr bool isComparable = false;                                  \
+  static constexpr bool hasMachineParameters =                                 \
+      ArithTraits<mag_type>::hasMachineParameters;                             \
+                                                                               \
+  static FUNC_QUAL val_type zero() {                                           \
+    return val_type(ArithTraits<mag_type>::zero(),                             \
+                    ArithTraits<mag_type>::zero());                            \
+  }                                                                            \
+  static FUNC_QUAL val_type one() {                                            \
+    return val_type(ArithTraits<mag_type>::one(),                              \
+                    ArithTraits<mag_type>::zero());                            \
+  }                                                                            \
+  static FUNC_QUAL val_type min() {                                            \
+    return val_type(ArithTraits<mag_type>::min(),                              \
+                    ArithTraits<mag_type>::min());                             \
+  }                                                                            \
+  static FUNC_QUAL val_type max() {                                            \
+    return val_type(ArithTraits<mag_type>::max(),                              \
+                    ArithTraits<mag_type>::max());                             \
+  }                                                                            \
+  static FUNC_QUAL val_type infinity() {                                       \
+    return val_type(ArithTraits<mag_type>::infinity(),                         \
+                    ArithTraits<mag_type>::infinity());                        \
+  }                                                                            \
+  static FUNC_QUAL val_type nan() {                                            \
+    return val_type(ArithTraits<mag_type>::nan(),                              \
+                    ArithTraits<mag_type>::nan());                             \
+  }                                                                            \
+  static FUNC_QUAL mag_type epsilon() {                                        \
+    return ArithTraits<mag_type>::epsilon();                                   \
+  }                                                                            \
+  static FUNC_QUAL mag_type sfmin() { return ArithTraits<mag_type>::sfmin(); } \
+  static FUNC_QUAL int base() { return ArithTraits<mag_type>::base(); }        \
+  static FUNC_QUAL mag_type prec() { return ArithTraits<mag_type>::prec(); }   \
+  static FUNC_QUAL int t() { return ArithTraits<mag_type>::t(); }              \
+  static FUNC_QUAL mag_type rnd() { return ArithTraits<mag_type>::rnd(); }     \
+  static FUNC_QUAL int emin() { return ArithTraits<mag_type>::emin(); }        \
+  static FUNC_QUAL mag_type rmin() { return ArithTraits<mag_type>::rmin(); }   \
+  static FUNC_QUAL int emax() { return ArithTraits<mag_type>::emax(); }        \
+  static FUNC_QUAL mag_type rmax() { return ArithTraits<mag_type>::rmax(); }   \
+  static FUNC_QUAL bool isInf(const val_type x) {                              \
+    return ArithTraits<mag_type>::isInf(x.real()) ||                           \
+           ArithTraits<mag_type>::isInf(x.imag());                             \
+  }                                                                            \
+  static FUNC_QUAL bool isNan(const val_type x) {                              \
+    return ArithTraits<mag_type>::isNan(x.real()) ||                           \
+           ArithTraits<mag_type>::isNan(x.imag());                             \
+  }                                                                            \
+  static FUNC_QUAL mag_type abs(const val_type x) { return ::Kokkos::abs(x); } \
+  static FUNC_QUAL mag_type real(const val_type x) { return x.real(); }        \
+  static FUNC_QUAL mag_type imag(const val_type x) { return x.imag(); }        \
+  static FUNC_QUAL val_type conj(const val_type x) {                           \
+    return ::Kokkos::conj(x);                                                  \
+  }                                                                            \
+  static FUNC_QUAL val_type pow(const val_type x, const val_type y) {          \
+    return Kokkos::pow(x, y);                                                  \
+  }                                                                            \
+  static FUNC_QUAL val_type pow(const val_type x, const mag_type y) {          \
+    return Kokkos::pow(x, y);                                                  \
+  }                                                                            \
+  static FUNC_QUAL val_type pow(const mag_type x, const val_type y) {          \
+    return Kokkos::pow(x, y);                                                  \
+  }                                                                            \
+  static FUNC_QUAL val_type sqrt(const val_type x) {                           \
+    return ::Kokkos::sqrt(x);                                                  \
+  }                                                                            \
+  static FUNC_QUAL val_type exp(const val_type x) { return Kokkos::exp(x); }   \
+  static FUNC_QUAL val_type log(const val_type x) { return Kokkos::log(x); }   \
+  static FUNC_QUAL val_type log10(const val_type x) {                          \
+    return Kokkos::log10(x);                                                   \
+  }                                                                            \
+  static FUNC_QUAL val_type sin(const val_type x) { return Kokkos::sin(x); }   \
+  static FUNC_QUAL val_type cos(const val_type x) { return Kokkos::cos(x); }   \
+  static FUNC_QUAL val_type tan(const val_type x) { return Kokkos::tan(x); }   \
+  static FUNC_QUAL val_type sinh(const val_type x) { return Kokkos::sinh(x); } \
+  static FUNC_QUAL val_type cosh(const val_type x) { return Kokkos::cosh(x); } \
+  static FUNC_QUAL val_type tanh(const val_type x) { return Kokkos::tanh(x); } \
+  static FUNC_QUAL val_type asin(const val_type x) { return Kokkos::asin(x); } \
+  static FUNC_QUAL val_type acos(const val_type x) { return Kokkos::acos(x); } \
+  static FUNC_QUAL val_type atan(const val_type x) { return Kokkos::atan(x); } \
+  static FUNC_QUAL bool isnaninf(const val_type& x) {                          \
+    return isNan(x) || isInf(x);                                               \
+  }                                                                            \
+  static FUNC_QUAL mag_type magnitude(const val_type x) { return abs(x); }     \
+  static FUNC_QUAL val_type conjugate(const val_type x) { return conj(x); }    \
+  static FUNC_QUAL val_type squareroot(const val_type x) { return sqrt(x); }   \
+  static FUNC_QUAL mag_type eps() { return epsilon(); }
+
+template <typename val_type>
+static KOKKOS_FUNCTION
+    typename std::enable_if<std::numeric_limits<val_type>::is_signed,
+                            val_type>::type
+    KokkosKernelsAbs(const val_type x) {
+  return Kokkos::abs(x);
+}
+
+template <typename val_type>
+static KOKKOS_FUNCTION
+    typename std::enable_if<!std::numeric_limits<val_type>::is_signed,
+                            val_type>::type
+    KokkosKernelsAbs(const val_type x) {
+  return x;
+}
+
+template <typename val_type>
+static KOKKOS_FUNCTION
+    typename std::enable_if<std::numeric_limits<val_type>::is_signed,
+                            val_type>::type
+    KokkosKernelsNan() {
+  return -1;
+}
+
+template <typename val_type>
+static KOKKOS_FUNCTION
+    typename std::enable_if<!std::numeric_limits<val_type>::is_signed,
+                            val_type>::type
+    KokkosKernelsNan() {
+  return Kokkos::Experimental::finite_max<val_type>::value;
+}
+
+#define KOKKOSKERNELS_ARITHTRAITS_INTEGRAL()                                  \
+                                                                              \
+  static constexpr bool is_specialized = true;                                \
+  static constexpr bool is_integer     = true;                                \
+  static constexpr bool is_exact       = true;                                \
+  static constexpr bool is_complex     = false;                               \
+  static constexpr bool has_infinity   = false;                               \
+                                                                              \
+  using magnitudeType   = mag_type;                                           \
+  using halfPrecision   = val_type;                                           \
+  using doublePrecision = val_type;                                           \
+                                                                              \
+  static constexpr bool isComplex            = false;                         \
+  static constexpr bool isOrdinal            = true;                          \
+  static constexpr bool isComparable         = true;                          \
+  static constexpr bool hasMachineParameters = false;                         \
+                                                                              \
+  static KOKKOS_FUNCTION val_type zero() { return static_cast<val_type>(0); } \
+  static KOKKOS_FUNCTION val_type one() { return static_cast<val_type>(1); }  \
+  static KOKKOS_FUNCTION val_type min() {                                     \
+    return Kokkos::Experimental::finite_min<val_type>::value;                 \
+  }                                                                           \
+  static KOKKOS_FUNCTION val_type max() {                                     \
+    return Kokkos::Experimental::finite_max<val_type>::value;                 \
+  }                                                                           \
+  static KOKKOS_FUNCTION val_type infinity() {                                \
+    return static_cast<val_type>(0);                                          \
+  }                                                                           \
+  static KOKKOS_FUNCTION val_type nan() {                                     \
+    return KokkosKernelsNan<val_type>();                                      \
+  }                                                                           \
+  static KOKKOS_FUNCTION bool isInf(const val_type) { return false; }         \
+  static KOKKOS_FUNCTION bool isNan(const val_type) { return false; }         \
+  static KOKKOS_FUNCTION mag_type abs(const val_type x) {                     \
+    return KokkosKernelsAbs(x);                                               \
+  }                                                                           \
+  static KOKKOS_FUNCTION mag_type real(const val_type x) {                    \
+    return Kokkos::real(x);                                                   \
+  }                                                                           \
+  static KOKKOS_FUNCTION mag_type imag(const val_type) { return zero(); }     \
+  static KOKKOS_FUNCTION val_type conj(const val_type x) { return x; }        \
+  static KOKKOS_FUNCTION val_type pow(const val_type x, const val_type y) {   \
+    return Kokkos::pow(x, y);                                                 \
+  }                                                                           \
+  static KOKKOS_FUNCTION val_type sqrt(const val_type x) {                    \
+    return static_cast<val_type>(Kokkos::sqrt(abs(x)));                       \
+  }                                                                           \
+  static KOKKOS_FUNCTION val_type cbrt(const val_type x) {                    \
+    return static_cast<val_type>(Kokkos::cbrt(abs(x)));                       \
+  }                                                                           \
+  static KOKKOS_FUNCTION val_type exp(const val_type x) {                     \
+    return static_cast<val_type>(Kokkos::exp(abs(x)));                        \
+  }                                                                           \
+  static KOKKOS_FUNCTION val_type log(const val_type x) {                     \
+    return static_cast<val_type>(Kokkos::log(abs(x)));                        \
+  }                                                                           \
+  static KOKKOS_FUNCTION val_type log10(const val_type x) {                   \
+    return static_cast<val_type>(Kokkos::log10(abs(x)));                      \
+  }                                                                           \
+  static KOKKOS_FUNCTION mag_type epsilon() { return zero(); }                \
+  static KOKKOS_FUNCTION magnitudeType magnitude(const val_type x) {          \
+    return abs(x);                                                            \
+  }                                                                           \
+  static KOKKOS_FUNCTION val_type conjugate(const val_type x) {               \
+    return conj(x);                                                           \
+  }                                                                           \
+  static KOKKOS_FUNCTION bool isnaninf(const val_type) { return false; }      \
+  static KOKKOS_FUNCTION val_type squareroot(const val_type x) {              \
+    return sqrt(x);                                                           \
+  }
+
+/// \class ArithTraits
+/// \brief Traits class for arithmetic on type T.
+/// \tparam T "Scalar" type of interest
+///
+/// This is a traits class for the "arithmetic" type T.  "Arithmetic
+/// types" include built-in signed and unsigned integer types,
+/// floating-point types, complex-valued types, and anything else that
+/// looks like these.  This class is useful for implementing numerical
+/// algorithms that are generic on the data type.  You may also use
+/// this class to query attributes of T, like whether it is signed or
+/// complex, or its precision.
+///
+/// We really did not want to implement this class or expose it to
+/// users.  It would be much better to use existing traits classes
+/// like std::numeric_limits.  We decided to implement and expose this
+/// class for the following reasons:
+/// <ol>
+/// <li> std::numeric_limits class methods cannot be used in CUDA
+///      device functions, since they themselves are not device
+///      functions </li>
+/// <li> Existing traits classes like std::numeric_limits do not
+///      provide enough information to implement algorithms that are
+///      agnostic of whether T is real-valued or complex-valued. </li>
+/// </ol>
+///
+/// All class methods must be suitable for parallel kernels, if the
+/// type T itself is suitable for parallel kernels.  In particular,
+/// specializations for types T that make sense to use on a CUDA
+/// device must mark all class methods as device (and host) functions,
+/// using the KOKKOS_FORCEINLINE_FUNCTION macro.  All class methods must be
+/// callable both inside and outside a parallel kernel (for CUDA, this
+/// means they must be marked as both device and host functions).
+///
+/// \section Kokkos_ArithTraits_compat Compatibility
+///
+/// Whenever possible, class methods in ArithTraits use the same names
+/// as their equivalents in the C++ Standard Library.  If this was not
+/// possible, for example with isInf and isNan, we explain why in
+/// their documentation.
+///
+/// This class has redundant typedefs and methods in order to maintain
+/// backwards compatibility with Teuchos::ScalarTraits, while
+/// preferring forwards (partial) compatibility with
+/// std::numeric_limits.  Users should prefer typedefs, \c bool
+/// constants, and class methods compatible with std::numeric_limits,
+/// to those from Teuchos::ScalarTraits.  The latter may go away at
+/// any time.  Furthermore, Teuchos::ScalarTraits contains methods
+/// that do not make sense for use as parallel device functions, in
+/// particular those relating to pseudorandom number generation that
+/// refer to hidden state, so we will never include all class methods
+/// from Teuchos::ScalarTraits in ArithTraits.
+///
+/// \section Kokkos_ArithTraits_unsupp Unsupported types on CUDA devices
+///
+/// CUDA does not support long double or std::complex<T> in device
+/// functions.  ArithTraits does have specializations for these types,
+/// but the class methods therein are not marked as device functions.
+///
+/// \section Kokkos_ArithTraits_whyNotC99 What about C99 integer types?
+///
+/// C99 and C++11 include typedefs int${N}_t and uint${N}_t, where N
+/// is the number of bits in the integer.  These typedefs are useful
+/// because they make the length of the type explicit.  Users are
+/// welcome to use these types as the template parameter of
+/// ArithTraits.
+///
+/// We chose not to use these types when <i>defining</i> full
+/// specializations of ArithTraits.  This is because the C99 integer
+/// types are typedefs, not types in themselves.  This makes it
+/// impossible to avoid duplicate or missing full specializations of
+/// ArithTraits.  For example, on my Mac, for CUDA 5.5, gcc 4.2.1, and
+/// Clang 3.2, <tt>int64_t</tt> is a typedef of <tt>long long</tt>,
+/// but <tt>long long</tt> and <tt>long</tt> are separate types, even
+/// though they have the same length (64 bits).  In contrast, on
+/// Windows (even Win64), <tt>long</tt> is a 32-bit type (but a
+/// distinct type from <tt>int</tt>), and <tt>long long</tt> is a
+/// 64-bit type.  Thus, if we define full specializations of
+/// ArithTraits using <i>only</i> the C99 integer types, we will be
+/// missing a specialization for <tt>long</tt> on at least one
+/// platform.
+///
+/// Rather than trouble ourselves with trying to figure this out for
+/// each platform, we decided to provide specializations only for the
+/// integer types in the C89 and C++03 language standards.  This
+/// includes signed and unsigned versions of <tt>char</tt>,
+/// <tt>short</tt>, <tt>int</tt>, and <tt>long</tt>.  We also include
+/// <tt>long long</tt> if your platform supports it.  We may thus have
+/// left out some C99 integer type, but this is only possible if the
+/// C89 / C++03 integer types do not have complete coverage of all
+/// powers of two bits from 8 up to the longest provided length (e.g.,
+/// 64 on a 64-bit system).  On all platforms I have encountered,
+/// <tt>char</tt> has 8 bits and <tt>short</tt> has 16 bits, so I am
+/// not worried about missing specializations for <tt>int16_t</tt> or
+/// <tt>uint16_t</tt>.  If you should find that either of these
+/// specializations are missing, though, please let us know.
+///
+/// Note that <tt>char</tt>, <tt>signed char</tt>, and <tt>unsigned
+/// char</tt> are distinct types, whether <tt>char</tt> is signed or
+/// unsigned.  (The language standards do not specify whether
+/// <tt>char</tt> is signed or unsigned.)  That is, <tt>char</tt> is
+/// <i>not</i> a typedef of <tt>signed char</tt> or <tt>unsigned
+/// char</tt>.  This is why we provide full specializations of
+/// ArithTraits for each of these types.  Interestingly enough, on my
+/// system, <tt>char</tt> and <tt>int8_t</tt> are different types, but
+/// <tt>signed char</tt> and <tt>int8_t</tt> are the same.
+///
+/// \section Kokkos_ArithTraits_impl Implementation notes
+///
+/// This section contains notes to developers who which to add a
+/// partial specialization of this class for a new type T.  If you
+/// decide to write a default templated implementation, it must not
+/// declare any methods as device functions.  This ensures correct
+/// behavior for arbitrary T, but does require specializations for
+/// common types like T = float and double, as well as for other types
+/// T that make sense to use on a CUDA device.
+template <class T>
+class ArithTraits {
+ public:
+  /// \brief A type that acts like T and works with Kokkos.
+  ///
+  /// This is usually just an alias for T.  However, some types T do
+  /// not work well with Kokkos.  In that case, we use a mostly
+  /// equivalent type here.  For example, ArithTraits<std::complex<R>
+  /// >::val_type is Kokkos::complex<R>.
+  using val_type = T;
+  /// \brief The type of the magnitude (absolute value) of T.
+  ///
+  /// We define this as the type returned by abs() in this class.  If
+  /// T is real (not complex), then \c val_type and \c mag_type are
+  /// usually the same.  If T is <tt>std::complex<R></tt> for some R,
+  /// then R and \c mag_type are usually the same.
+  using mag_type = T;
+
+  //! Whether ArithTraits has a specialization for T.
+  static constexpr bool is_specialized = false;
+  //! Whether T is a signed type (has negative values).
+  static constexpr bool is_signed = false;
+  //! Whether T is an integer type.
+  static constexpr bool is_integer = false;
+  /// \brief Whether T "uses exact representations."
+  ///
+  /// The opposite of is_exact is "is approximate," that is, "may
+  /// commit rounding error."
+  static constexpr bool is_exact = false;
+  //! Whether T is a complex-valued type.
+  static constexpr bool is_complex = false;
+
+  /// \brief Whether x is Inf.
+  ///
+  /// This can only be true for floating-point types T that support
+  /// Inf.  If T is a complex type, we say that a T instance x is Inf
+  /// if and only if <tt>isinf(real(x)) || isinf(imag(x))</tt>.
+  ///
+  /// Unfortunately we can't call this "isinf" (the equivalent C99
+  /// function), because CUDA appears to implement that function using
+  /// a macro, rather than using a function (as C++11 requires).
+  static KOKKOS_FUNCTION bool isInf(const T& x);
+
+  /// \brief Whether x is NaN (not a number).
+  ///
+  /// This can only be true for floating-point types T that support
+  /// NaN.  If T is a complex type, we say that a T instance x is NaN
+  /// if and only if <tt>isNan(real(x)) || isNan(imag(x))</tt>.
+  ///
+  /// Unfortunately we can't call this "isnan" (the equivalent C99
+  /// function), because CUDA appears to implement that function using
+  /// a macro, rather than using a function (as C++11 requires).
+  static KOKKOS_FUNCTION bool isNan(const T& x);
+
+  //! The absolute value (magnitude) of x.
+  static KOKKOS_FUNCTION mag_type abs(const T& x);
+
+  //! The zero value of T; the arithmetic identity.
+  static KOKKOS_FUNCTION T zero();
+
+  //! The one value of T; the multiplicative identity.
+  static KOKKOS_FUNCTION T one();
+
+  /// \brief True if this type T is capable of representing the
+  /// positive infinity as a distinct special value, as with
+  /// std::numeric_limits<T>::has_infinity.
+  static constexpr bool has_infinity = false;
+
+  /// \brief Returns the special value "positive infinity", as
+  /// represented by the floating-point type T. Only meaningful if
+  /// KokkosArithTraits<T>::has_infinity == true. Provides same
+  /// functionality as std::numeric_limits<T>::infinity().
+  ///
+  /// \note Would have liked to mark it as constexpr but then would
+  /// not be able to provide the specialization for std::complex<T>
+  /// since its constructor only becomes constexpr with C++14.
+  static KOKKOS_FUNCTION T infinity();
+
+  /// \brief The minimum possible value of T.
+  ///
+  /// If T is a real floating-point type, then this is the minimum
+  /// <i>positive</i> value, as with std::numeric_limits<T>::min().
+  static KOKKOS_FUNCTION T min();
+
+  //! The maximum possible value of T.
+  static KOKKOS_FUNCTION T max();
+
+  /// \brief The real part of x.
+  ///
+  /// If \c is_complex is false, then this just returns x.
+  static KOKKOS_FUNCTION mag_type real(const T& x);
+
+  /// \brief The imaginary part of x.
+  ///
+  /// If \c is_complex is false, then this just returns zero().
+  static KOKKOS_FUNCTION mag_type imag(const T&);
+
+  /// \brief The complex conjugate of x.
+  ///
+  /// If \c is_complex is false, then this just returns x.
+  static KOKKOS_FUNCTION T conj(const T&);
+
+  //! x raised to the power y.
+  static KOKKOS_FUNCTION T pow(const T& x, const T& y);
+
+  /// \brief The square root of x.
+  ///
+  /// If T is an integer type, this is the floor of the square root.
+  /// If T is a complex-valued type, then this method returns the
+  /// principal branch of the square root.
+  ///
+  /// If T is real-valued and x is negative, the result of the square
+  /// root is undefined in general.  (CUDA does not allow throwing
+  /// exceptions in device functions.)  Implementations should return
+  /// NaN if the type T supports this.  Of course, in that case, the
+  /// square of the result will not equal x.
+  static KOKKOS_FUNCTION T sqrt(const T& x);
+
+  /// \brief The cubic root of x.
+  ///
+  /// If T is an integer type, this is the floor of the cubic root.
+  /// If T is a complex-valued type, then this method returns the
+  /// principal branch of the cubic root.
+  ///
+  /// If T is real-valued and x is negative, the result of the cubic
+  /// root is undefined in general.  (CUDA does not allow throwing
+  /// exceptions in device functions.)  Implementations should return
+  /// NaN if the type T supports this.  Of course, in that case, the
+  /// cubic of the result will not equal x.
+  static KOKKOS_FUNCTION T cbrt(const T& x);
+
+  /// \brief The natural (base e) exponential function of x.
+  ///
+  /// If T is an integer type, this is the floor of the exponential
+  /// function.  If T is a complex-valued type, then this method
+  /// returns \f$e^{x+iy} = e^x ( cos(y) + i sin(y) )\f$.
+  ///
+  static KOKKOS_FUNCTION T exp(const T& x);
+
+  /// \brief The natural (base e) logarithm of x.
+  ///
+  /// If T is an integer type, this is the floor of the logarithm.  If
+  /// T is a complex-valued type, then this method returns the
+  /// principal branch of the logarithm.
+  ///
+  /// If T is real-valued and x is negative, the result of the
+  /// logarithm is undefined in general.  (CUDA does not allow
+  /// throwing exceptions in device functions.)  Implementations
+  /// should return NaN if the type T supports this.  Of course, in
+  /// that case, if y is the result, \f$e^y\f$ will not equal x.
+  static KOKKOS_FUNCTION T log(const T& x);
+
+  /// \brief The base ten logarithm of the input.
+  ///
+  /// If T is an integer type, this is the floor of the logarithm.  If
+  /// T is a complex-valued type, then this method returns the
+  /// principal branch of the logarithm.
+  ///
+  /// If T is real-valued and x is negative, the result of the
+  /// logarithm is undefined in general.  (CUDA does not allow
+  /// throwing exceptions in device functions.)  Implementations
+  /// should return NaN if the type T supports this.  Of course, in
+  /// that case, if y is the result, \f$10^y\f$ will not equal x.
+  static KOKKOS_FUNCTION T log10(const T& x);
+
+  /// Trigonometric and hyperbolic functions are not available
+  /// for integer types. This is because asin(sin(x)) is not x
+  /// when x is integer with a rounding error.
+  ///
+  ///  KJ: log, exp also has this problem. We probably need to
+  ///      disable them for integer types instead of providing
+  ///      functionality with floor.
+
+  /// \brief The sin function of x
+  ///
+  static KOKKOS_FUNCTION T sin(const T& x);
+
+  /// \brief The cos function of x
+  ///
+  static KOKKOS_FUNCTION T cos(const T& x);
+
+  /// \brief The tan function of x
+  ///
+  static KOKKOS_FUNCTION T tan(const T& x);
+
+  /// \brief The sin hyperbolic function of x
+  ///
+  static KOKKOS_FUNCTION T sinh(const T& x);
+
+  /// \brief The cos hyperbolic function of x
+  ///
+  static KOKKOS_FUNCTION T cosh(const T& x);
+
+  /// \brief The tan hyperbolic function of x
+  ///
+  static KOKKOS_FUNCTION T tanh(const T& x);
+
+  /// \brief The asin function of x
+  ///
+  static KOKKOS_FUNCTION T asin(const T& x);
+
+  /// \brief The acos function of x
+  ///
+  static KOKKOS_FUNCTION T acos(const T& x);
+
+  /// \brief The atan function of x
+  ///
+  static KOKKOS_FUNCTION T atan(const T& x);
+
+  /// \brief Return a silent NaN, if appropriate for T.
+  ///
+  /// If T does <i>not</i> implement a silent NaN, the return value is
+  /// undefined, but calling this method is still allowed.
+  static KOKKOS_FUNCTION T nan();
+
+  /// \brief Machine epsilon.
+  ///
+  /// If T is an integer type (std::numeric_traits<T>::is_exact is
+  /// true), then epsilon() returns 0.  Otherwise, if T is a
+  /// floating-point type, it returns machine epsilon that T.
+  static KOKKOS_FUNCTION mag_type epsilon();
+
+  //@{
+  /// \name Traits defined for backwards compatibility with
+  /// Teuchos::ScalarTraits
+  ///
+  /// All of the typedefs, \c bool constants, and class methods in
+  /// this section are defined in order that one may replace most uses
+  /// of Teuchos::ScalarTraits with ArithTraits.  Users who do not
+  /// have this backwards compatibility requirement should prefer
+  /// equivalents in other sections.  Those class methods which have
+  /// the same name and meaning in both Teuchos::ScalarTraits and this
+  /// class, such as log() and pow(), are not in this section.
+
+  //! Same as mag_type; the type of the absolute value (magnitude) of T.
+  using magnitudeType = T;
+
+  /// \brief The type with "half the precision" of T.
+  ///
+  /// This typedef only makes sense if T is a floating-point type.
+  using halfPrecision = T;
+
+  /// \brief The type with "twice the the precision" of T.
+  ///
+  /// This typedef only makes sense if T is a floating-point type.
+  using doublePrecision = T;
+
+  static constexpr bool isComplex    = false;
+  static constexpr bool isOrdinal    = false;
+  static constexpr bool isComparable = false;
+
+  /// \brief True if this type T has floating-point parameters.
+  ///
+  /// This is true if and only if this specialization of ArithTraits
+  /// has "machine-specific" parameters eps(), sfmin(), base(),
+  /// prec(), t(), rnd(), emin(), rmin(), emax(), and rmax(), relating
+  /// to floating-point types.
+  static constexpr bool hasMachineParameters = false;
+
+  //! Return relative machine precision.
+  static KOKKOS_FUNCTION mag_type eps();
+
+  //! Return safe minimum (sfmin), such that 1/sfmin does not overflow.
+  static KOKKOS_FUNCTION mag_type sfmin();
+
+  //! Return the base of the scalar type T.
+  static KOKKOS_FUNCTION int base();
+
+  //! Return <tt>eps*base</tt>.
+  static KOKKOS_FUNCTION mag_type prec();
+
+  //! Returns the number of (base) digits in the significand.
+  static KOKKOS_FUNCTION int t();
+
+  //! 1.0 when rounding occurs in addition, else 0.0.
+  static KOKKOS_FUNCTION mag_type rnd();
+
+  //! Returns the minimum exponent before (gradual) underflow.
+  static KOKKOS_FUNCTION int emin();
+
+  //! Returns the underflow threshold: <tt>base^(emin-1)</tt>
+  static KOKKOS_FUNCTION mag_type rmin();
+
+  //! Returns the largest exponent before overflow.
+  static KOKKOS_FUNCTION int emax();
+
+  //! Overflow theshold: <tt>(base^emax)*(1-eps)</tt>
+  static KOKKOS_FUNCTION mag_type rmax();
+
+  //! Same as abs(); return the magnitude of x.
+  static KOKKOS_FUNCTION magnitudeType magnitude(const T& x);
+
+  //! Same as conj(); return the complex conjugate of x.
+  static KOKKOS_FUNCTION T conjugate(const T& x);
+
+  /// \brief Whether x is (silent) NaN or Inf.
+  ///
+  /// This is the same as <tt>isNan(x) || isInf(x)</tt>.
+  static KOKKOS_FUNCTION bool isnaninf(const T& x);
+
+  /// \brief The string name of T.
+  ///
+  /// Note that this is not a device function.
+  static std::string name();
+
+  //! Same as sqrt(x); the square root of x.
+  static KOKKOS_FUNCTION T squareroot(const T& x);
+  //@}
+};
+
+// Since Kokkos::Experimental::half_t falls back to float, only define
+// ArithTraits if half_t is a backend specialization
+#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT
+template <>
+class ArithTraits<Kokkos::Experimental::half_t> {
+ public:
+  using val_type = Kokkos::Experimental::half_t;
+  using mag_type = val_type;
+
+  static constexpr bool is_specialized = true;
+  static constexpr bool is_signed      = true;
+  static constexpr bool is_integer     = false;
+  static constexpr bool is_exact       = false;
+  static constexpr bool is_complex     = false;
+
+  static constexpr bool has_infinity = true;
+  static KOKKOS_FUNCTION val_type infinity() {
+    return Kokkos::Experimental::cast_to_half(
+        Kokkos::Experimental::infinity<float>::value);
+  }
+
+  static KOKKOS_FUNCTION bool isInf(const val_type x) {
+#ifndef __CUDA_ARCH__
+    using std::isinf;
+#endif
+    return isinf(Kokkos::Experimental::cast_from_half<float>(x));
+  }
+  static KOKKOS_FUNCTION bool isNan(const val_type x) {
+#ifndef __CUDA_ARCH__
+    using std::isnan;
+#endif
+    return isnan(Kokkos::Experimental::cast_from_half<float>(x));
+  }
+  static KOKKOS_FUNCTION mag_type abs(const val_type x) {
+    return Kokkos::Experimental::cast_to_half(
+        Kokkos::abs(Kokkos::Experimental::cast_from_half<float>(x)));
+  }
+  static KOKKOS_FUNCTION val_type zero() {
+    return Kokkos::Experimental::cast_to_half(0.0);
+  }
+  static KOKKOS_FUNCTION val_type one() {
+    return Kokkos::Experimental::cast_to_half(1.0);
+  }
+  static KOKKOS_FUNCTION val_type min() {
+    return Kokkos::Experimental::cast_to_half(-KOKKOSKERNELS_IMPL_FP16_MAX);
+  }
+  static KOKKOS_FUNCTION val_type max() {
+    return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_MAX);
+  }
+  static KOKKOS_FUNCTION mag_type real(const val_type x) { return x; }
+  static KOKKOS_FUNCTION mag_type imag(const val_type) { return zero(); }
+  static KOKKOS_FUNCTION val_type conj(const val_type x) { return x; }
+  static KOKKOS_FUNCTION val_type pow(const val_type x, const val_type y) {
+    return Kokkos::Experimental::cast_to_half(
+        Kokkos::pow(Kokkos::Experimental::cast_from_half<float>(x),
+                    Kokkos::Experimental::cast_from_half<float>(y)));
+  }
+  static KOKKOS_FUNCTION val_type sqrt(const val_type x) {
+    return Kokkos::Experimental::cast_to_half(
+        Kokkos::sqrt(Kokkos::Experimental::cast_from_half<float>(x)));
+  }
+  static KOKKOS_FUNCTION val_type cbrt(const val_type x) {
+    return Kokkos::Experimental::cast_to_half(
+        Kokkos::cbrt(Kokkos::Experimental::cast_from_half<float>(x)));
+  }
+  static KOKKOS_FUNCTION val_type exp(const val_type x) {
+    return Kokkos::Experimental::cast_to_half(
+        Kokkos::exp(Kokkos::Experimental::cast_from_half<float>(x)));
+  }
+  static KOKKOS_FUNCTION val_type log(const val_type x) {
+    return Kokkos::Experimental::cast_to_half(
+        Kokkos::log(Kokkos::Experimental::cast_from_half<float>(x)));
+  }
+  static KOKKOS_FUNCTION val_type log10(const val_type x) {
+    return Kokkos::Experimental::cast_to_half(
+        Kokkos::log10(Kokkos::Experimental::cast_from_half<float>(x)));
+  }
+  static KOKKOS_FUNCTION val_type sin(const val_type x) {
+    return Kokkos::Experimental::cast_to_half(
+        Kokkos::sin(Kokkos::Experimental::cast_from_half<float>(x)));
+  }
+  static KOKKOS_FUNCTION val_type cos(const val_type x) {
+    return Kokkos::Experimental::cast_to_half(
+        Kokkos::cos(Kokkos::Experimental::cast_from_half<float>(x)));
+  }
+  static KOKKOS_FUNCTION val_type tan(const val_type x) {
+    return Kokkos::Experimental::cast_to_half(
+        Kokkos::tan(Kokkos::Experimental::cast_from_half<float>(x)));
+  }
+  static KOKKOS_FUNCTION val_type sinh(const val_type x) {
+    return Kokkos::Experimental::cast_to_half(
+        Kokkos::sinh(Kokkos::Experimental::cast_from_half<float>(x)));
+  }
+  static KOKKOS_FUNCTION val_type cosh(const val_type x) {
+    return Kokkos::Experimental::cast_to_half(
+        Kokkos::cosh(Kokkos::Experimental::cast_from_half<float>(x)));
+  }
+  static KOKKOS_FUNCTION val_type tanh(const val_type x) {
+    return Kokkos::Experimental::cast_to_half(
+        Kokkos::tanh(Kokkos::Experimental::cast_from_half<float>(x)));
+  }
+  static KOKKOS_FUNCTION val_type asin(const val_type x) {
+    return Kokkos::Experimental::cast_to_half(
+        Kokkos::asin(Kokkos::Experimental::cast_from_half<float>(x)));
+  }
+  static KOKKOS_FUNCTION val_type acos(const val_type x) {
+    return Kokkos::Experimental::cast_to_half(
+        Kokkos::acos(Kokkos::Experimental::cast_from_half<float>(x)));
+  }
+  static KOKKOS_FUNCTION val_type atan(const val_type x) {
+    return Kokkos::Experimental::cast_to_half(
+        Kokkos::atan(Kokkos::Experimental::cast_from_half<float>(x)));
+  }
+  static KOKKOS_FUNCTION mag_type epsilon() {
+    return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_EPSILON);
+  }
+  // Backwards compatibility with Teuchos::ScalarTraits.
+  using magnitudeType = mag_type;
+  // C++ doesn't have a standard "half-float" type.
+  using halfPrecision   = val_type;
+  using doublePrecision = double;
+
+  static constexpr bool isComplex            = false;
+  static constexpr bool isOrdinal            = false;
+  static constexpr bool isComparable         = true;
+  static constexpr bool hasMachineParameters = true;
+  static KOKKOS_FUNCTION bool isnaninf(const val_type x) {
+    return isNan(x) || isInf(x);
+  }
+  static KOKKOS_FUNCTION magnitudeType magnitude(const val_type x) {
+    return abs(x);
+  }
+  static KOKKOS_FUNCTION val_type conjugate(const val_type x) {
+    return conj(x);
+  }
+  static std::string name() { return "half"; }
+  static KOKKOS_FUNCTION val_type squareroot(const val_type x) {
+    return sqrt(x);
+  }
+  static KOKKOS_FUNCTION val_type nan() {
+    return Kokkos::Experimental::cast_to_half(
+        Kokkos::Experimental::quiet_NaN<float>::value);
+  }
+  static KOKKOS_FUNCTION mag_type eps() { return epsilon(); }
+  static KOKKOS_FUNCTION mag_type sfmin() {
+    return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_MIN);
+  }
+  static KOKKOS_FUNCTION int base() { return KOKKOSKERNELS_IMPL_FP16_RADIX; }
+  // Use float to allow running on both host and device
+  static KOKKOS_FUNCTION float prec() {
+    float e = KOKKOSKERNELS_IMPL_FP16_EPSILON;
+    float b = (float)base();
+    float r = e * b;
+    return r;
+  }
+  static KOKKOS_FUNCTION int t() { return KOKKOSKERNELS_IMPL_FP16_MANT_DIG; }
+  static KOKKOS_FUNCTION mag_type rnd() { return one(); }
+  static KOKKOS_FUNCTION int emin() { return KOKKOSKERNELS_IMPL_FP16_MIN_EXP; }
+  static KOKKOS_FUNCTION mag_type rmin() {
+    return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_MIN);
+  }
+  static KOKKOS_FUNCTION int emax() { return KOKKOSKERNELS_IMPL_FP16_MAX_EXP; }
+  static KOKKOS_FUNCTION mag_type rmax() {
+    return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_MAX);
+  }
+};
+#endif  // KOKKOS_HALF_T_IS_FLOAT && KOKKOS_ENABLE_CUDA_HALF
+
+// Since Kokkos::Experimental::bhalf_t falls back to float, only define
+// ArithTraits if bhalf_t is a backend specialization
+#if defined(KOKKOS_BHALF_T_IS_FLOAT) && !KOKKOS_BHALF_T_IS_FLOAT
+template <>
+class ArithTraits<Kokkos::Experimental::bhalf_t> {
+ public:
+  using val_type = Kokkos::Experimental::bhalf_t;
+  using mag_type = val_type;
+
+  static constexpr bool is_specialized = true;
+  static constexpr bool is_signed      = true;
+  static constexpr bool is_integer     = false;
+  static constexpr bool is_exact       = false;
+  static constexpr bool is_complex     = false;
+
+  static constexpr bool has_infinity = true;
+  static KOKKOS_FUNCTION val_type infinity() {
+    return Kokkos::Experimental::cast_to_bhalf(
+        Kokkos::Experimental::infinity<float>::value);
+  }
+
+  static KOKKOS_FUNCTION bool isInf(const val_type x) {
+    return Kokkos::isinf(Kokkos::Experimental::cast_from_bhalf<float>(x));
+  }
+  static KOKKOS_FUNCTION bool isNan(const val_type x) {
+    return Kokkos::isnan(Kokkos::Experimental::cast_from_bhalf<float>(x));
+  }
+  static KOKKOS_FUNCTION mag_type abs(const val_type x) {
+    return Kokkos::Experimental::cast_to_bhalf(
+        Kokkos::abs(Kokkos::Experimental::cast_from_bhalf<float>(x)));
+  }
+  static KOKKOS_FUNCTION val_type zero() {
+    return Kokkos::Experimental::cast_to_bhalf(0.0F);
+  }
+  static KOKKOS_FUNCTION val_type one() {
+    return Kokkos::Experimental::cast_to_bhalf(1.0F);
+  }
+  static KOKKOS_FUNCTION val_type min() {
+    return Kokkos::Experimental::cast_to_bhalf(-KOKKOSKERNELS_IMPL_BF16_MAX);
+  }
+  static KOKKOS_FUNCTION val_type max() {
+    return Kokkos::Experimental::cast_to_bhalf(KOKKOSKERNELS_IMPL_BF16_MAX);
+  }
+  static KOKKOS_FUNCTION mag_type real(const val_type x) { return x; }
+  static KOKKOS_FUNCTION mag_type imag(const val_type) {
+    return Kokkos::Experimental::cast_to_bhalf(0.0F);
+  }
+  static KOKKOS_FUNCTION val_type conj(const val_type x) { return x; }
+  static KOKKOS_FUNCTION val_type pow(const val_type x, const val_type y) {
+    return Kokkos::Experimental::cast_to_bhalf(
+        Kokkos::pow(Kokkos::Experimental::cast_from_bhalf<float>(x),
+                    Kokkos::Experimental::cast_from_bhalf<float>(y)));
+  }
+  static KOKKOS_FUNCTION val_type sqrt(const val_type x) {
+    return Kokkos::Experimental::cast_to_bhalf(
+        Kokkos::sqrt(Kokkos::Experimental::cast_from_bhalf<float>(x)));
+  }
+  static KOKKOS_FUNCTION val_type cbrt(const val_type x) {
+    return Kokkos::Experimental::cast_to_bhalf(
+        Kokkos::cbrt(Kokkos::Experimental::cast_from_bhalf<float>(x)));
+  }
+  static KOKKOS_FUNCTION val_type exp(const val_type x) {
+    return Kokkos::Experimental::cast_to_bhalf(
+        Kokkos::exp(Kokkos::Experimental::cast_from_bhalf<float>(x)));
+  }
+  static KOKKOS_FUNCTION val_type log(const val_type x) {
+    return Kokkos::Experimental::cast_to_bhalf(
+        Kokkos::log(Kokkos::Experimental::cast_from_bhalf<float>(x)));
+  }
+  static KOKKOS_FUNCTION val_type log10(const val_type x) {
+    return Kokkos::Experimental::cast_to_bhalf(
+        Kokkos::log10(Kokkos::Experimental::cast_from_bhalf<float>(x)));
+  }
+  static KOKKOS_FUNCTION val_type sin(const val_type x) {
+    return Kokkos::Experimental::cast_to_bhalf(
+        Kokkos::sin(Kokkos::Experimental::cast_from_bhalf<float>(x)));
+  }
+  static KOKKOS_FUNCTION val_type cos(const val_type x) {
+    return Kokkos::Experimental::cast_to_bhalf(
+        Kokkos::cos(Kokkos::Experimental::cast_from_bhalf<float>(x)));
+  }
+  static KOKKOS_FUNCTION val_type tan(const val_type x) {
+    return Kokkos::Experimental::cast_to_bhalf(
+        Kokkos::tan(Kokkos::Experimental::cast_from_bhalf<float>(x)));
+  }
+  static KOKKOS_FUNCTION val_type sinh(const val_type x) {
+    return Kokkos::Experimental::cast_to_bhalf(
+        Kokkos::sinh(Kokkos::Experimental::cast_from_bhalf<float>(x)));
+  }
+  static KOKKOS_FUNCTION val_type cosh(const val_type x) {
+    return Kokkos::Experimental::cast_to_bhalf(
+        Kokkos::cosh(Kokkos::Experimental::cast_from_bhalf<float>(x)));
+  }
+  static KOKKOS_FUNCTION val_type tanh(const val_type x) {
+    return Kokkos::Experimental::cast_to_bhalf(
+        Kokkos::tanh(Kokkos::Experimental::cast_from_bhalf<float>(x)));
+  }
+  static KOKKOS_FUNCTION val_type asin(const val_type x) {
+    return Kokkos::Experimental::cast_to_bhalf(
+        Kokkos::asin(Kokkos::Experimental::cast_from_bhalf<float>(x)));
+  }
+  static KOKKOS_FUNCTION val_type acos(const val_type x) {
+    return Kokkos::Experimental::cast_to_bhalf(
+        Kokkos::acos(Kokkos::Experimental::cast_from_bhalf<float>(x)));
+  }
+  static KOKKOS_FUNCTION val_type atan(const val_type x) {
+    return Kokkos::Experimental::cast_to_bhalf(
+        Kokkos::atan(Kokkos::Experimental::cast_from_bhalf<float>(x)));
+  }
+  static KOKKOS_FUNCTION mag_type epsilon() {
+    // return ::pow(2, -KOKKOSKERNELS_IMPL_BF16_SIGNIFICAND_BITS);
+    return Kokkos::Experimental::cast_to_bhalf(KOKKOSKERNELS_IMPL_BF16_EPSILON);
+  }
+  // Backwards compatibility with Teuchos::ScalarTraits.
+  using magnitudeType = mag_type;
+  // C++ doesn't have a standard "bhalf-float" type.
+  using bhalfPrecision  = val_type;
+  using doublePrecision = double;
+
+  static constexpr bool isComplex            = false;
+  static constexpr bool isOrdinal            = false;
+  static constexpr bool isComparable         = true;
+  static constexpr bool hasMachineParameters = true;
+  static KOKKOS_FUNCTION bool isnaninf(const val_type x) {
+    return isNan(x) || isInf(x);
+  }
+  static KOKKOS_FUNCTION magnitudeType magnitude(const val_type x) {
+    return abs(x);
+  }
+  static KOKKOS_FUNCTION val_type conjugate(const val_type x) {
+    return conj(x);
+  }
+  static std::string name() { return "bhalf"; }
+  static KOKKOS_FUNCTION val_type squareroot(const val_type x) {
+    return sqrt(x);
+  }
+  static KOKKOS_FUNCTION val_type nan() {
+    return Kokkos::Experimental::cast_to_bhalf(
+        Kokkos::Experimental::quiet_NaN<float>::value);
+  }
+  static KOKKOS_FUNCTION mag_type eps() { return epsilon(); }
+  static KOKKOS_FUNCTION mag_type sfmin() {
+    return Kokkos::Experimental::cast_to_bhalf(KOKKOSKERNELS_IMPL_BF16_MIN);
+  }
+  static KOKKOS_FUNCTION int base() { return KOKKOSKERNELS_IMPL_BF16_RADIX; }
+  // Use float to allow running on both host and device
+  static KOKKOS_FUNCTION float prec() {
+    float e = KOKKOSKERNELS_IMPL_BF16_EPSILON;
+    float b = (float)base();
+    float r = e * b;
+    return r;
+  }
+  static KOKKOS_FUNCTION int t() { return KOKKOSKERNELS_IMPL_BF16_MANT_DIG; }
+  static KOKKOS_FUNCTION mag_type rnd() { return one(); }
+  static KOKKOS_FUNCTION int emin() { return KOKKOSKERNELS_IMPL_BF16_MIN_EXP; }
+  static KOKKOS_FUNCTION mag_type rmin() {
+    return Kokkos::Experimental::cast_to_bhalf(KOKKOSKERNELS_IMPL_BF16_MIN);
+  }
+  static KOKKOS_FUNCTION int emax() { return KOKKOSKERNELS_IMPL_BF16_MAX_EXP; }
+  static KOKKOS_FUNCTION mag_type rmax() {
+    return Kokkos::Experimental::cast_to_bhalf(KOKKOSKERNELS_IMPL_BF16_MAX);
+  }
+};
+#endif  // KOKKOS_BHALF_T_IS_FLOAT
+
+template <>
+class ArithTraits<float> {
+ public:
+  using val_type = float;
+  using mag_type = val_type;
+
+  static constexpr bool is_specialized = true;
+  static constexpr bool is_signed      = true;
+  static constexpr bool is_integer     = false;
+  static constexpr bool is_exact       = false;
+  static constexpr bool is_complex     = false;
+  static constexpr bool has_infinity   = true;
+
+  // Backwards compatibility with Teuchos::ScalarTraits.
+  using magnitudeType   = mag_type;
+  using halfPrecision   = float;  // Should we switch to Kokkos::half_t
+  using doublePrecision = double;
+
+  static constexpr bool isComplex            = false;
+  static constexpr bool isOrdinal            = false;
+  static constexpr bool isComparable         = true;
+  static constexpr bool hasMachineParameters = true;
+
+  static std::string name() { return "float"; }
+
+  KOKKOSKERNELS_ARITHTRAITS_REAL_FP(KOKKOS_FUNCTION)
+};
+
+template <>
+class ArithTraits<double> {
+ public:
+  using val_type = double;
+  using mag_type = val_type;
+
+  static constexpr bool is_specialized = true;
+  static constexpr bool is_signed      = true;
+  static constexpr bool is_integer     = false;
+  static constexpr bool is_exact       = false;
+  static constexpr bool is_complex     = false;
+  static constexpr bool has_infinity   = true;
+
+  // Backwards compatibility with Teuchos::ScalarTraits.
+  using magnitudeType = mag_type;
+  using halfPrecision = float;
+#if defined(__CUDA_ARCH__)
+  using doublePrecision =
+      double;  // CUDA doesn't support long double, unfortunately
+#elif defined(__HIP_DEVICE_COMPILE__)
+  using doublePrecision =
+      double;  // HIP does not support long double unfortunately
+#else
+  using doublePrecision = long double;
+#endif  // __CUDA_ARCH__
+  static constexpr bool isComplex            = false;
+  static constexpr bool isOrdinal            = false;
+  static constexpr bool isComparable         = true;
+  static constexpr bool hasMachineParameters = true;
+
+  static std::string name() { return "double"; }
+
+  KOKKOSKERNELS_ARITHTRAITS_REAL_FP(KOKKOS_FUNCTION)
+};
+
+// CUDA and HIP do not support long double in device functions,
+// so none of the class methods in this specialization are marked
+// as device functions.
+template <>
+class ArithTraits<long double> {
+ public:
+  using val_type = long double;
+  using mag_type = long double;
+
+  static constexpr bool is_specialized = true;
+  static constexpr bool is_signed      = true;
+  static constexpr bool is_integer     = false;
+  static constexpr bool is_exact       = false;
+  static constexpr bool is_complex     = false;
+  static constexpr bool has_infinity   = true;
+
+  // Backwards compatibility with Teuchos::ScalarTraits.
+  using magnitudeType = mag_type;
+  using halfPrecision = double;
+  // It might be appropriate to use QD's qd_real here.
+  // For now, long double is the most you get.
+  using doublePrecision = val_type;
+
+  static constexpr bool isComplex            = false;
+  static constexpr bool isOrdinal            = false;
+  static constexpr bool isComparable         = true;
+  static constexpr bool hasMachineParameters = true;
+
+  static std::string name() { return "long double"; }
+
+  KOKKOSKERNELS_ARITHTRAITS_REAL_FP()
+};  // long double specialization
+
+#if defined(KOKKOS_ENABLE_LIBQUADMATH)
+// CUDA does not support __float128 in device functions, so none of
+// the class methods in this specialization are marked as device
+// functions.
+template <>
+class ArithTraits<__float128> {
+ public:
+  using val_type = __float128;
+  using mag_type = val_type;
+
+  static constexpr bool is_specialized = true;
+  static constexpr bool is_signed      = true;
+  static constexpr bool is_integer     = false;
+  static constexpr bool is_exact       = false;
+  static constexpr bool is_complex     = false;
+  static constexpr bool has_infinity   = true;
+
+  // Backwards compatibility with Teuchos::ScalarTraits.
+  using magnitudeType = mag_type;
+  using halfPrecision = double;
+  // Unfortunately, we can't rely on a standard __float256 type.
+  using doublePrecision = __float128;
+
+  static constexpr bool isComplex            = false;
+  static constexpr bool isOrdinal            = false;
+  static constexpr bool isComparable         = true;
+  static constexpr bool hasMachineParameters = true;
+
+  static std::string name() { return "__float128"; }
+
+  KOKKOSKERNELS_ARITHTRAITS_REAL_FP()
+};      // __float128 specialization
+#endif  // KOKKOS_ENABLE_LIBQUADMATH
+
+template <>
+class ArithTraits< ::Kokkos::complex<float> > {
+ public:
+  using val_type = ::Kokkos::complex<float>;
+  using mag_type = float;
+
+  static std::string name() { return "Kokkos::complex<float>"; }
+
+  KOKKOSKERNELS_ARITHTRAITS_CMPLX_FP(KOKKOS_FUNCTION)
+};
+
+template <>
+class ArithTraits< ::Kokkos::complex<double> > {
+ public:
+  using val_type = ::Kokkos::complex<double>;
+  using mag_type = double;
+
+  static std::string name() { return "Kokkos::complex<double>"; }
+
+  KOKKOSKERNELS_ARITHTRAITS_CMPLX_FP(KOKKOS_FUNCTION)
+};
+
+/// \brief Partial specialization for std::complex<RealFloatType>.
+///
+/// The C++ Standard Library (with C++03 at least) only allows
+/// std::complex<RealFloatType> for RealFloatType = float, double, or
+/// long double.
+template <class RealFloatType>
+class ArithTraits<std::complex<RealFloatType> > {
+ public:
+  //! Kokkos internally replaces std::complex with Kokkos::complex.
+  using val_type = ::Kokkos::complex<RealFloatType>;
+  using mag_type = RealFloatType;
+
+  static constexpr bool is_specialized = true;
+  static constexpr bool is_signed      = true;
+  static constexpr bool is_integer     = false;
+  static constexpr bool is_exact       = false;
+  static constexpr bool is_complex     = true;
+
+  static constexpr bool has_infinity = true;
+  static std::complex<RealFloatType> infinity() {
+    return std::complex<RealFloatType>(ArithTraits<mag_type>::infinity(),
+                                       ArithTraits<mag_type>::infinity());
+  }
+
+#ifdef KOKKOS_ENABLE_SYCL
+  template <typename Dummy = RealFloatType>
+  static bool isInf(const std::complex<Dummy>& x) {
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
+    using std::isinf;
+#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL)
+    using sycl::isinf;
+#endif
+    return isinf(real(x)) || isinf(imag(x));
+  }
+  template <>
+  static bool isInf<long double>(const std::complex<long double>& x) {
+    Kokkos::abort("isInf not available for std::complex<long double>!\n");
+    return true;
+  }
+#else
+  static bool isInf(const std::complex<RealFloatType>& x) {
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
+    using std::isinf;
+#endif
+    return isinf(real(x)) || isinf(imag(x));
+  }
+#endif
+#ifdef KOKKOS_ENABLE_SYCL
+  template <typename Dummy = RealFloatType>
+  static bool isNan(const std::complex<Dummy>& x) {
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
+    using std::isnan;
+#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL)
+    using sycl::isnan;
+#endif
+    return isnan(real(x)) || isnan(imag(x));
+  }
+  template <>
+  static bool isNan<long double>(const std::complex<long double>& x) {
+    Kokkos::abort("isNan not available for std::complex<long double>!\n");
+    return true;
+  }
+#else
+  static bool isNan(const std::complex<RealFloatType>& x) {
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
+    using std::isnan;
+#endif
+    return isnan(real(x)) || isnan(imag(x));
+  }
+#endif
+  static mag_type abs(const std::complex<RealFloatType>& x) {
+    return std::abs(x);
+  }
+  static std::complex<RealFloatType> zero() {
+    return std::complex<RealFloatType>(ArithTraits<mag_type>::zero(),
+                                       ArithTraits<mag_type>::zero());
+  }
+  static std::complex<RealFloatType> one() {
+    return std::complex<RealFloatType>(ArithTraits<mag_type>::one(),
+                                       ArithTraits<mag_type>::zero());
+  }
+  static std::complex<RealFloatType> min() {
+    return std::complex<RealFloatType>(ArithTraits<mag_type>::min(),
+                                       ArithTraits<mag_type>::zero());
+  }
+  static std::complex<RealFloatType> max() {
+    return std::complex<RealFloatType>(ArithTraits<mag_type>::max(),
+                                       ArithTraits<mag_type>::zero());
+  }
+  static mag_type real(const std::complex<RealFloatType>& x) {
+    return std::real(x);
+  }
+  static mag_type imag(const std::complex<RealFloatType>& x) {
+    return std::imag(x);
+  }
+  static std::complex<RealFloatType> conj(
+      const std::complex<RealFloatType>& x) {
+    return std::conj(x);
+  }
+  static std::complex<RealFloatType> pow(const std::complex<RealFloatType>& x,
+                                         const std::complex<RealFloatType>& y) {
+    // Fix for some weird gcc 4.2.1 inaccuracy.
+    if (y == one()) {
+      return x;
+    } else if (y == one() + one()) {
+      return x * x;
+    } else {
+      return std::pow(x, y);
+    }
+  }
+  static std::complex<RealFloatType> pow(const std::complex<RealFloatType>& x,
+                                         const RealFloatType& y) {
+    // Fix for some weird gcc 4.2.1 inaccuracy.
+    if (y == ArithTraits<RealFloatType>::one()) {
+      return x;
+    } else if (y == ArithTraits<RealFloatType>::one() +
+                        ArithTraits<RealFloatType>::one()) {
+      return x * x;
+    } else {
+      return std::pow(x, y);
+    }
+  }
+  static std::complex<RealFloatType> sqrt(
+      const std::complex<RealFloatType>& x) {
+    return std::sqrt(x);
+  }
+  static std::complex<RealFloatType> cbrt(
+      const std::complex<RealFloatType>& x) {
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+    return sycl::cbrt(x);
+#else
+    return ::cbrt(x);
+#endif
+  }
+  static std::complex<RealFloatType> exp(const std::complex<RealFloatType>& x) {
+    return std::exp(x);
+  }
+  static std::complex<RealFloatType> log(const std::complex<RealFloatType>& x) {
+    return std::log(x);
+  }
+  static std::complex<RealFloatType> log10(
+      const std::complex<RealFloatType>& x) {
+    return std::log10(x);
+  }
+  static std::complex<RealFloatType> sin(const std::complex<RealFloatType>& x) {
+    return std::sin(x);
+  }
+  static std::complex<RealFloatType> cos(const std::complex<RealFloatType>& x) {
+    return std::cos(x);
+  }
+  static std::complex<RealFloatType> tan(const std::complex<RealFloatType>& x) {
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+    return sycl::tan(x);
+#else
+    return std::tan(x);
+#endif
+  }
+  static std::complex<RealFloatType> sinh(
+      const std::complex<RealFloatType>& x) {
+    return std::sinh(x);
+  }
+  static std::complex<RealFloatType> cosh(
+      const std::complex<RealFloatType>& x) {
+    return std::cosh(x);
+  }
+  static std::complex<RealFloatType> tanh(
+      const std::complex<RealFloatType>& x) {
+    return std::tanh(x);
+  }
+  static std::complex<RealFloatType> asin(
+      const std::complex<RealFloatType>& x) {
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+    return sycl::asin(x);
+#else
+    return ::asin(x);
+#endif
+  }
+  static std::complex<RealFloatType> acos(
+      const std::complex<RealFloatType>& x) {
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+    return sycl::acos(x);
+#else
+    return ::acos(x);
+#endif
+  }
+  static std::complex<RealFloatType> atan(
+      const std::complex<RealFloatType>& x) {
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+    using sycl::atan;
+#else
+    using std::atan;
+#endif
+    return atan(x);
+  }
+  static std::complex<RealFloatType> nan() {
+    const mag_type mag_nan = ArithTraits<mag_type>::nan();
+    return std::complex<RealFloatType>(mag_nan, mag_nan);
+  }
+  static mag_type epsilon() { return ArithTraits<mag_type>::epsilon(); }
+
+  // Backwards compatibility with Teuchos::ScalarTraits.
+  using magnitudeType = mag_type;
+  using halfPrecision =
+      std::complex<typename ArithTraits<mag_type>::halfPrecision>;
+  using doublePrecision =
+      std::complex<typename ArithTraits<mag_type>::doublePrecision>;
+
+  static constexpr bool isComplex            = true;
+  static constexpr bool isOrdinal            = false;
+  static constexpr bool isComparable         = false;
+  static constexpr bool hasMachineParameters = true;
+  static bool isnaninf(const std::complex<RealFloatType>& x) {
+    return isNan(x) || isInf(x);
+  }
+  static mag_type magnitude(const std::complex<RealFloatType>& x) {
+    return abs(x);
+  }
+  static std::complex<RealFloatType> conjugate(
+      const std::complex<RealFloatType>& x) {
+    return conj(x);
+  }
+  static std::string name() {
+    return std::string("std::complex<") + ArithTraits<mag_type>::name() + ">";
+  }
+  static std::complex<RealFloatType> squareroot(
+      const std::complex<RealFloatType>& x) {
+    return sqrt(x);
+  }
+  static mag_type eps() { return epsilon(); }
+  static mag_type sfmin() { return ArithTraits<mag_type>::sfmin(); }
+  static int base() { return ArithTraits<mag_type>::base(); }
+  static mag_type prec() { return ArithTraits<mag_type>::prec(); }
+  static int t() { return ArithTraits<mag_type>::t(); }
+  static mag_type rnd() { return ArithTraits<mag_type>::one(); }
+  static int emin() { return ArithTraits<mag_type>::emin(); }
+  static mag_type rmin() { return ArithTraits<mag_type>::rmin(); }
+  static int emax() { return ArithTraits<mag_type>::emax(); }
+  static mag_type rmax() { return ArithTraits<mag_type>::rmax(); }
+};
+
+template <>
+class ArithTraits<char> {
+ public:
+  using val_type = char;
+  using mag_type = val_type;
+
+  // The C(++) standard does not require that char be signed.  In
+  // fact, signed char, unsigned char, and char are distinct types.
+  // We can use std::numeric_limits here because it's a const bool,
+  // not a class method.
+  static constexpr bool is_signed = std::numeric_limits<val_type>::is_signed;
+
+  static std::string name() { return "char"; }
+
+  KOKKOSKERNELS_ARITHTRAITS_INTEGRAL()
+};
+
+template <>
+class ArithTraits<signed char> {
+ public:
+  using val_type = signed char;
+  using mag_type = val_type;
+
+  static constexpr bool is_signed = true;
+
+  static std::string name() { return "signed char"; }
+
+  KOKKOSKERNELS_ARITHTRAITS_INTEGRAL()
+};
+
+template <>
+class ArithTraits<unsigned char> {
+ public:
+  using val_type = unsigned char;
+  using mag_type = val_type;
+
+  static constexpr bool is_signed = false;
+
+  static std::string name() { return "unsigned char"; }
+
+  KOKKOSKERNELS_ARITHTRAITS_INTEGRAL()
+};
+
+template <>
+class ArithTraits<short> {
+ public:
+  using val_type = short;
+  using mag_type = val_type;
+
+  static constexpr bool is_signed = true;
+
+  static std::string name() { return "short"; }
+
+  KOKKOSKERNELS_ARITHTRAITS_INTEGRAL()
+};
+
+template <>
+class ArithTraits<unsigned short> {
+ public:
+  using val_type = unsigned short;
+  using mag_type = val_type;
+
+  static constexpr bool is_signed = false;
+
+  static std::string name() { return "unsigned short"; }
+
+  KOKKOSKERNELS_ARITHTRAITS_INTEGRAL()
+};
+
+template <>
+class ArithTraits<int> {
+ public:
+  using val_type = int;
+  using mag_type = val_type;
+
+  static constexpr bool is_signed = true;
+
+  static std::string name() { return "int"; }
+
+  KOKKOSKERNELS_ARITHTRAITS_INTEGRAL()
+};
+
+template <>
+class ArithTraits<unsigned int> {
+ public:
+  using val_type = unsigned int;
+  using mag_type = val_type;
+
+  static constexpr bool is_signed = false;
+
+  static std::string name() { return "unsigned int"; }
+
+  KOKKOSKERNELS_ARITHTRAITS_INTEGRAL()
+};
+
+template <>
+class ArithTraits<long> {
+ public:
+  using val_type = long;
+  using mag_type = val_type;
+
+  static constexpr bool is_signed = true;
+
+  static std::string name() { return "long"; }
+
+  KOKKOSKERNELS_ARITHTRAITS_INTEGRAL()
+};
+
+template <>
+class ArithTraits<unsigned long> {
+ public:
+  using val_type = unsigned long;
+  using mag_type = val_type;
+
+  static constexpr bool is_signed = false;
+
+  static std::string name() { return "unsigned long"; }
+
+  KOKKOSKERNELS_ARITHTRAITS_INTEGRAL()
+};
+
+template <>
+class ArithTraits<long long> {
+ public:
+  using val_type = long long;
+  using mag_type = val_type;
+
+  static constexpr bool is_signed = true;
+
+  static std::string name() { return "long long"; }
+
+  KOKKOSKERNELS_ARITHTRAITS_INTEGRAL()
+};
+
+template <>
+class ArithTraits<unsigned long long> {
+ public:
+  using val_type = unsigned long long;
+  using mag_type = val_type;
+
+  static constexpr bool is_signed = false;
+
+  static std::string name() { return "unsigned long long"; }
+
+  KOKKOSKERNELS_ARITHTRAITS_INTEGRAL()
+};
+
+// dd_real and qd_real are floating-point types provided by the QD
+// library of David Bailey (LBNL):
+//
+// http://crd-legacy.lbl.gov/~dhbailey/mpdist/
+//
+// dd_real uses two doubles (128 bits), and qd_real uses four doubles
+// (256 bits).
+//
+// Kokkos does <i>not</i> currently support these types in device
+// functions.  It should be possible to use Kokkos' support for
+// aggregate types to implement device function support for dd_real
+// and qd_real, but we have not done this yet (as of 09 Jan 2015).
+// Hence, the class methods of the ArithTraits specializations for
+// dd_real and qd_real are not marked as device functions.
+#ifdef HAVE_KOKKOS_QD
+// LBV: I would like to deprecate this strange optional
+// dependency on the lbnl package, is there anyone actully
+// using this? It certainly is never tested by CI or nightly
+// so probably does not work...
+template <>
+struct [[deprecated]] ArithTraits<dd_real> {
+  typedef dd_real val_type;
+  typedef dd_real mag_type;
+
+  static const bool is_specialized = true;
+  static const bool is_signed      = true;
+  static const bool is_integer     = false;
+  static const bool is_exact       = false;
+  static const bool is_complex     = false;
+
+  static inline bool isInf(const val_type& x) { return isinf(x); }
+  static inline bool isNan(const val_type& x) { return isnan(x); }
+  static inline mag_type abs(const val_type& x) { return ::abs(x); }
+  static inline val_type zero() { return val_type(0.0); }
+  static inline val_type one() { return val_type(1.0); }
+  static inline val_type min() { return std::numeric_limits<val_type>::min(); }
+  static inline val_type max() { return std::numeric_limits<val_type>::max(); }
+  static inline mag_type real(const val_type& x) { return x; }
+  static inline mag_type imag(const val_type&) { return zero(); }
+  static inline val_type conj(const val_type& x) { return x; }
+  static inline val_type pow(const val_type& x, const val_type& y) {
+    return ::pow(x, y);
+  }
+  static inline val_type sqrt(const val_type& x) {
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+    return sycl::sqrt(x);
+#else
+    return ::sqrt(x);
+#endif
+  }
+  static inline val_type cbrt(const val_type& x) {
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+    return sycl::cbrt(x);
+#else
+    return ::cbrt(x);
+#endif
+  }
+  static inline val_type exp(const val_type& x) { return ::exp(x); }
+  static inline val_type log(const val_type& x) {
+    // dd_real puts its transcendental functions in the global namespace.
+    return ::log(x);
+  }
+  static inline val_type log10(const val_type& x) { return ::log10(x); }
+  static KOKKOS_FUNCTION val_type sin(const val_type x) { return ::sin(x); }
+  static KOKKOS_FUNCTION val_type cos(const val_type x) { return ::cos(x); }
+  static KOKKOS_FUNCTION val_type tan(const val_type x) {
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+    return sycl::tan(x);
+#else
+    return std::tan(x);
+#endif
+  }
+  static KOKKOS_FUNCTION val_type sinh(const val_type x) { return ::sinh(x); }
+  static KOKKOS_FUNCTION val_type cosh(const val_type x) { return ::cosh(x); }
+  static KOKKOS_FUNCTION val_type tanh(const val_type x) { return ::tanh(x); }
+  static KOKKOS_FUNCTION val_type asin(const val_type x) {
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+    return sycl::asin(x);
+#else
+    return ::asin(x);
+#endif
+  }
+  static KOKKOS_FUNCTION val_type acos(const val_type x) {
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+    return sycl::acos(x);
+#else
+    return ::acos(x);
+#endif
+  }
+  static KOKKOS_FUNCTION val_type atan(const val_type x) {
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+    return sycl::atan(x);
+#else
+    return ::atan(x);
+#endif
+  }
+  static inline val_type nan() { return val_type::_nan; }
+  static val_type epsilon() { return std::numeric_limits<val_type>::epsilon(); }
+
+  typedef dd_real magnitudeType;
+  typedef double halfPrecision;
+  typedef qd_real doublePrecision;
+
+  static const bool isComplex            = false;
+  static const bool isOrdinal            = false;
+  static const bool isComparable         = true;
+  static const bool hasMachineParameters = true;
+
+  static mag_type eps() { return epsilon(); }
+  static mag_type sfmin() { return min(); }
+  static int base() { return std::numeric_limits<val_type>::radix; }
+  static mag_type prec() { return eps() * base(); }
+  static int t() { return std::numeric_limits<val_type>::digits; }
+  static mag_type rnd() {
+    return std::numeric_limits<val_type>::round_style == std::round_to_nearest
+               ? one()
+               : zero();
+  }
+  static int emin() { return std::numeric_limits<val_type>::min_exponent; }
+  static mag_type rmin() { return std::numeric_limits<val_type>::min(); }
+  static int emax() { return std::numeric_limits<val_type>::max_exponent; }
+  static mag_type rmax() { return std::numeric_limits<val_type>::max(); }
+  static mag_type magnitude(const val_type& x) { return ::abs(x); }
+  static val_type conjugate(const val_type& x) { return conj(x); }
+  static bool isnaninf(const val_type& x) { return isNan(x) || isInf(x); }
+  static std::string name() { return "dd_real"; }
+  static val_type squareroot(const val_type& x) {
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+    return sycl::sqrt(x);
+#else
+    return ::sqrt(x);
+#endif
+  }
+};
+
+template <>
+struct [[deprecated]] ArithTraits<qd_real> {
+  typedef qd_real val_type;
+  typedef qd_real mag_type;
+
+  static const bool is_specialized = true;
+  static const bool is_signed      = true;
+  static const bool is_integer     = false;
+  static const bool is_exact       = false;
+  static const bool is_complex     = false;
+
+  static inline bool isInf(const val_type& x) { return isinf(x); }
+  static inline bool isNan(const val_type& x) { return isnan(x); }
+  static inline mag_type abs(const val_type& x) { return ::abs(x); }
+  static inline val_type zero() { return val_type(0.0); }
+  static inline val_type one() { return val_type(1.0); }
+  static inline val_type min() { return std::numeric_limits<val_type>::min(); }
+  static inline val_type max() { return std::numeric_limits<val_type>::max(); }
+  static inline mag_type real(const val_type& x) { return x; }
+  static inline mag_type imag(const val_type&) { return zero(); }
+  static inline val_type conj(const val_type& x) { return x; }
+  static inline val_type pow(const val_type& x, const val_type& y) {
+    return ::pow(x, y);
+  }
+  static inline val_type sqrt(const val_type& x) {
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+    return sycl::sqrt(x);
+#else
+    return ::sqrt(x);
+#endif
+  }
+  static inline val_type cbrt(const val_type& x) {
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+    return sycl::cbrt(x);
+#else
+    return ::cbrt(x);
+#endif
+  }
+  static inline val_type exp(const val_type& x) { return ::exp(x); }
+  static inline val_type log(const val_type& x) {
+    // val_type puts its transcendental functions in the global namespace.
+    return ::log(x);
+  }
+  static inline val_type log10(const val_type& x) { return ::log10(x); }
+  static KOKKOS_FUNCTION val_type sin(const val_type x) { return ::sin(x); }
+  static KOKKOS_FUNCTION val_type cos(const val_type x) { return ::cos(x); }
+  static KOKKOS_FUNCTION val_type tan(const val_type x) {
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+    return sycl::tan(x);
+#else
+    return std::tan(x);
+#endif
+  }
+  static KOKKOS_FUNCTION val_type sinh(const val_type x) { return ::sinh(x); }
+  static KOKKOS_FUNCTION val_type cosh(const val_type x) { return ::cosh(x); }
+  static KOKKOS_FUNCTION val_type tanh(const val_type x) { return ::tanh(x); }
+  static KOKKOS_FUNCTION val_type asin(const val_type x) {
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+    return sycl::asin(x);
+#else
+    return ::asin(x);
+#endif
+  }
+  static KOKKOS_FUNCTION val_type acos(const val_type x) {
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+    return sycl::acos(x);
+#else
+    return ::acos(x);
+#endif
+  }
+  static KOKKOS_FUNCTION val_type atan(const val_type x) {
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+    return sycl::atan(x);
+#else
+    return ::atan(x);
+#endif
+  }
+  static inline val_type nan() { return val_type::_nan; }
+  static inline val_type epsilon() {
+    return std::numeric_limits<val_type>::epsilon();
+  }
+
+  typedef qd_real magnitudeType;
+  typedef dd_real halfPrecision;
+  // The QD library does not have an "oct-double real" class.  One
+  // could use an arbitrary-precision library like MPFR or ARPREC,
+  // with the precision set appropriately, to get an
+  // extended-precision type for qd_real.
+  typedef qd_real doublePrecision;
+
+  static const bool isComplex            = false;
+  static const bool isOrdinal            = false;
+  static const bool isComparable         = true;
+  static const bool hasMachineParameters = true;
+
+  static mag_type eps() { return epsilon(); }
+  static mag_type sfmin() { return min(); }
+  static int base() { return std::numeric_limits<val_type>::radix; }
+  static mag_type prec() { return eps() * base(); }
+  static int t() { return std::numeric_limits<val_type>::digits; }
+  static mag_type rnd() {
+    return std::numeric_limits<val_type>::round_style == std::round_to_nearest
+               ? one()
+               : zero();
+  }
+  static int emin() { return std::numeric_limits<val_type>::min_exponent; }
+  static mag_type rmin() { return std::numeric_limits<val_type>::min(); }
+  static int emax() { return std::numeric_limits<val_type>::max_exponent; }
+  static mag_type rmax() { return std::numeric_limits<val_type>::max(); }
+  static mag_type magnitude(const val_type& x) { return ::abs(x); }
+  static val_type conjugate(const val_type& x) { return conj(x); }
+  static bool isnaninf(const val_type& x) { return isNan(x) || isInf(x); }
+  static std::string name() { return "qd_real"; }
+  static val_type squareroot(const val_type& x) {
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+    return sycl::sqrt(x);
+#else
+    return ::sqrt(x);
+#endif
+  }
+};
+#endif  // HAVE_KOKKOS_QD
+
+}  // namespace Details
+
+// Promote ArithTraits into Kokkos namespace.  At some point, we
+// will remove it from the Details namespace completely.  We leave
+// it there for now, because a lot of code depends on it being
+// there.
+using Details::ArithTraits;
+}  // namespace Kokkos
+
+#endif  // KOKKOS_ARITHTRAITS_HPP
diff --git a/src/Kokkos_InnerProductSpaceTraits.hpp b/src/common/Kokkos_InnerProductSpaceTraits.hpp
similarity index 100%
rename from src/Kokkos_InnerProductSpaceTraits.hpp
rename to src/common/Kokkos_InnerProductSpaceTraits.hpp
diff --git a/src/graph/KokkosGraph_Distance1Color.hpp b/src/graph/KokkosGraph_Distance1Color.hpp
index 3001ea660c..aca6414c83 100644
--- a/src/graph/KokkosGraph_Distance1Color.hpp
+++ b/src/graph/KokkosGraph_Distance1Color.hpp
@@ -44,8 +44,8 @@
 #ifndef _KOKKOSGRAPH_DISTANCE1_COLOR_HPP
 #define _KOKKOSGRAPH_DISTANCE1_COLOR_HPP
 
-#include "KokkosGraph_Distance1ColorHandle.hpp"
-#include "KokkosGraph_Distance1Color_impl.hpp"
+#include "KokkosGraph_color_d1_spec.hpp"
+#include "KokkosKernels_helpers.hpp"
 #include "KokkosKernels_Utils.hpp"
 
 namespace KokkosGraph {
@@ -59,81 +59,35 @@ void graph_color_symbolic(KernelHandle *handle,
                           typename KernelHandle::nnz_lno_t /* num_cols */,
                           lno_row_view_t_ row_map, lno_nnz_view_t_ entries,
                           bool /* is_symmetric */ = true) {
-  Kokkos::Timer timer;
-
-  typename KernelHandle::GraphColoringHandleType *gch =
-      handle->get_graph_coloring_handle();
-
-  ColoringAlgorithm algorithm = gch->get_coloring_algo_type();
-
-  typedef typename KernelHandle::GraphColoringHandleType::color_view_t
-      color_view_type;
-
-  gch->set_tictoc(handle->get_verbose());
-
-  color_view_type colors_out;
-  if (gch->get_vertex_colors().use_count() > 0) {
-    colors_out = gch->get_vertex_colors();
-  } else {
-    colors_out = color_view_type("Graph Colors", num_rows);
-  }
-
-  typedef
-      typename Impl::GraphColor<typename KernelHandle::GraphColoringHandleType,
-                                lno_row_view_t_, lno_nnz_view_t_>
-          BaseGraphColoring;
-  BaseGraphColoring *gc = NULL;
-
-  switch (algorithm) {
-    case COLORING_SERIAL:
-      gc = new BaseGraphColoring(num_rows, entries.extent(0), row_map, entries,
-                                 gch);
-      break;
-
-    case COLORING_VB:
-    case COLORING_VBBIT:
-    case COLORING_VBCS:
-      typedef typename Impl::GraphColor_VB<
-          typename KernelHandle::GraphColoringHandleType, lno_row_view_t_,
-          lno_nnz_view_t_>
-          VBGraphColoring;
-      gc = new VBGraphColoring(num_rows, entries.extent(0), row_map, entries,
-                               gch);
-      break;
-
-    case COLORING_VBD:
-    case COLORING_VBDBIT:
-      typedef typename Impl::GraphColor_VBD<
-          typename KernelHandle::GraphColoringHandleType, lno_row_view_t_,
-          lno_nnz_view_t_>
-          VBDGraphColoring;
-      gc = new VBDGraphColoring(num_rows, entries.extent(0), row_map, entries,
-                                gch);
-      break;
-
-    case COLORING_EB:
-      typedef typename Impl::GraphColor_EB<
-          typename KernelHandle::GraphColoringHandleType, lno_row_view_t_,
-          lno_nnz_view_t_>
-          EBGraphColoring;
-      gc = new EBGraphColoring(num_rows, entries.extent(0), row_map, entries,
-                               gch);
-      break;
-
-    case COLORING_DEFAULT: break;
-
-    default: break;
-  }
-
-  int num_phases = 0;
-  gc->color_graph(colors_out, num_phases);
-
-  delete gc;
-  double coloring_time = timer.seconds();
-  gch->add_to_overall_coloring_time(coloring_time);
-  gch->set_coloring_time(coloring_time);
-  gch->set_num_phases(num_phases);
-  gch->set_vertex_colors(colors_out);
+  typedef typename KernelHandle::HandleExecSpace ExecSpace;
+  typedef typename KernelHandle::HandleTempMemorySpace MemSpace;
+  typedef typename KernelHandle::HandlePersistentMemorySpace PersistentMemSpace;
+  typedef typename Kokkos::Device<ExecSpace, MemSpace> DeviceType;
+
+  typedef typename KernelHandle::const_size_type c_size_t;
+  typedef typename KernelHandle::const_nnz_lno_t c_lno_t;
+  typedef typename KernelHandle::const_nnz_scalar_t c_scalar_t;
+
+  typedef typename KokkosKernels::Experimental::KokkosKernelsHandle<
+      c_size_t, c_lno_t, c_scalar_t, ExecSpace, MemSpace, PersistentMemSpace>
+      ConstKernelHandle;
+  ConstKernelHandle tmp_handle(*handle);
+
+  typedef Kokkos::View<typename lno_row_view_t_::const_value_type *,
+                       typename KokkosKernels::Impl::GetUnifiedLayout<
+                           lno_row_view_t_>::array_layout,
+                       DeviceType, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
+      Internal_rowmap;
+  typedef Kokkos::View<typename lno_nnz_view_t_::const_value_type *,
+                       typename KokkosKernels::Impl::GetUnifiedLayout<
+                           lno_nnz_view_t_>::array_layout,
+                       DeviceType, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
+      Internal_entries;
+  KokkosGraph::Impl::
+      COLOR_D1<ConstKernelHandle, Internal_rowmap, Internal_entries>::color_d1(
+          &tmp_handle, num_rows,
+          Internal_rowmap(row_map.data(), row_map.extent(0)),
+          Internal_entries(entries.data(), entries.extent(0)));
 }
 
 template <class KernelHandle, typename lno_row_view_t_,
diff --git a/src/graph/KokkosGraph_Distance1ColorHandle.hpp b/src/graph/KokkosGraph_Distance1ColorHandle.hpp
index 7f04bfa94f..0f5d60591f 100644
--- a/src/graph/KokkosGraph_Distance1ColorHandle.hpp
+++ b/src/graph/KokkosGraph_Distance1ColorHandle.hpp
@@ -560,9 +560,9 @@ class GraphColoringHandle {
       if (color_max < colors(i)) color_max = colors(i);
     }
 
+    // max-plus semiring equivalent of "plus"
     KOKKOS_INLINE_FUNCTION
-    void join(volatile color_t &dst, const volatile color_t &src)
-        const {  // max -plus semiring equivalent of "plus"
+    void join(color_t &dst, const color_t &src) const {
       if (dst < src) {
         dst = src;
       }
diff --git a/src/graph/KokkosGraph_Distance2Color.hpp b/src/graph/KokkosGraph_Distance2Color.hpp
index 211ad42f63..dbfd1b40e9 100644
--- a/src/graph/KokkosGraph_Distance2Color.hpp
+++ b/src/graph/KokkosGraph_Distance2Color.hpp
@@ -157,8 +157,8 @@ void bipartite_color_rows(KernelHandle *handle,
     // Compute the transpose
     col_map     = TRowmap("Col map", num_columns + 1);
     col_entries = TEntries("Col entries", nnz);
-    KokkosKernels::Impl::transpose_graph<InRowmap, InEntries, TRowmap, TEntries,
-                                         TRowmap, execution_space>(
+    KokkosSparse::Impl::transpose_graph<InRowmap, InEntries, TRowmap, TEntries,
+                                        TRowmap, execution_space>(
         num_rows, num_columns, row_map, row_entries, col_map, col_entries);
   }
   InternalRowmap rowmap_internal(row_map.data(), row_map.extent(0));
@@ -235,8 +235,8 @@ void bipartite_color_columns(KernelHandle *handle,
   TRowmap col_map("Col map", num_columns + 1);
   TEntries col_entries(
       Kokkos::view_alloc(Kokkos::WithoutInitializing, "Col entries"), nnz);
-  KokkosKernels::Impl::transpose_graph<InRowmap, InEntries, TRowmap, TEntries,
-                                       TRowmap, execution_space>(
+  KokkosSparse::Impl::transpose_graph<InRowmap, InEntries, TRowmap, TEntries,
+                                      TRowmap, execution_space>(
       num_rows, num_columns, row_map, row_entries, col_map, col_entries);
   // Get unmanaged views for both graph and its transpose
   InternalRowmap colmap_internal(col_map.data(), col_map.extent(0));
diff --git a/src/graph/KokkosGraph_ExplicitCoarsening.hpp b/src/graph/KokkosGraph_ExplicitCoarsening.hpp
index 8992aa4bb8..322004c0b6 100644
--- a/src/graph/KokkosGraph_ExplicitCoarsening.hpp
+++ b/src/graph/KokkosGraph_ExplicitCoarsening.hpp
@@ -46,7 +46,7 @@
 #define KOKKOSGRAPH_EXPLICIT_COARSEN_HPP
 
 #include "KokkosGraph_ExplicitCoarsening_impl.hpp"
-#include "KokkosKernels_Sorting.hpp"
+#include "KokkosSparse_SortCrs.hpp"
 
 namespace KokkosGraph {
 namespace Experimental {
@@ -86,8 +86,8 @@ void graph_explicit_coarsen(
   if (compress) {
     coarse_rowmap_t mergedRowmap;
     coarse_entries_t mergedEntries;
-    KokkosKernels::sort_and_merge_graph<exec_space, coarse_rowmap_t,
-                                        coarse_entries_t>(
+    KokkosSparse::sort_and_merge_graph<exec_space, coarse_rowmap_t,
+                                       coarse_entries_t>(
         coarseRowmap, coarseEntries, mergedRowmap, mergedEntries);
     coarseRowmap  = mergedRowmap;
     coarseEntries = mergedEntries;
@@ -125,8 +125,8 @@ void graph_explicit_coarsen_with_inverse_map(
   if (compress) {
     coarse_rowmap_t mergedRowmap;
     coarse_entries_t mergedEntries;
-    KokkosKernels::sort_and_merge_graph<exec_space, coarse_rowmap_t,
-                                        coarse_entries_t>(
+    KokkosSparse::sort_and_merge_graph<exec_space, coarse_rowmap_t,
+                                       coarse_entries_t>(
         coarseRowmap, coarseEntries, mergedRowmap, mergedEntries);
     coarseRowmap  = mergedRowmap;
     coarseEntries = mergedEntries;
diff --git a/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp b/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp
index 39e27795cc..64873708b5 100644
--- a/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp
+++ b/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp
@@ -411,7 +411,6 @@ class GraphColor_VB
 
     nnz_lno_t numUncolored = this->nv;
 
-    double t, total = 0.0;
     double total_time_greedy_phase               = 0.0;
     double total_time_find_conflicts             = 0.0;
     double total_time_serial_conflict_resolution = 0.0;
@@ -435,8 +434,7 @@ class GraphColor_VB
       MyExecSpace().fence();
 
       if (this->_ticToc) {
-        t = timer.seconds();
-        total += t;
+        double t = timer.seconds();
         total_time_greedy_phase += t;
         std::cout << "\tTime speculative greedy phase " << iter << " : " << t
                   << std::endl;
@@ -459,8 +457,7 @@ class GraphColor_VB
       MyExecSpace().fence();
 
       if (_ticToc) {
-        t = timer.seconds();
-        total += t;
+        double t = timer.seconds();
         total_time_find_conflicts += t;
         std::cout << "\tTime conflict detection " << iter << " : " << t
                   << std::endl;
@@ -500,8 +497,7 @@ class GraphColor_VB
       }
       MyExecSpace().fence();
       if (_ticToc) {
-        t = timer.seconds();
-        total += t;
+        double t = timer.seconds();
         total_time_serial_conflict_resolution += t;
         std::cout << "\tTime serial conflict resolution: " << t << std::endl;
       }
@@ -3118,6 +3114,88 @@ class GraphColor_EB : public GraphColor<HandleType, in_row_index_view_type_,
   };
 };
 
+template <class KernelHandle, typename lno_row_view_t_,
+          typename lno_nnz_view_t_>
+void graph_color_impl(KernelHandle *handle,
+                      typename KernelHandle::nnz_lno_t num_rows,
+                      lno_row_view_t_ row_map, lno_nnz_view_t_ entries) {
+  Kokkos::Timer timer;
+
+  typename KernelHandle::GraphColoringHandleType *gch =
+      handle->get_graph_coloring_handle();
+
+  ColoringAlgorithm algorithm = gch->get_coloring_algo_type();
+
+  typedef typename KernelHandle::GraphColoringHandleType::color_view_t
+      color_view_type;
+
+  gch->set_tictoc(handle->get_verbose());
+
+  color_view_type colors_out;
+  if (gch->get_vertex_colors().use_count() > 0) {
+    colors_out = gch->get_vertex_colors();
+  } else {
+    colors_out = color_view_type("Graph Colors", num_rows);
+  }
+
+  typedef
+      typename Impl::GraphColor<typename KernelHandle::GraphColoringHandleType,
+                                lno_row_view_t_, lno_nnz_view_t_>
+          BaseGraphColoring;
+  BaseGraphColoring *gc = NULL;
+
+  switch (algorithm) {
+    case COLORING_SERIAL:
+      gc = new BaseGraphColoring(num_rows, entries.extent(0), row_map, entries,
+                                 gch);
+      break;
+
+    case COLORING_VB:
+    case COLORING_VBBIT:
+    case COLORING_VBCS:
+      typedef typename Impl::GraphColor_VB<
+          typename KernelHandle::GraphColoringHandleType, lno_row_view_t_,
+          lno_nnz_view_t_>
+          VBGraphColoring;
+      gc = new VBGraphColoring(num_rows, entries.extent(0), row_map, entries,
+                               gch);
+      break;
+
+    case COLORING_VBD:
+    case COLORING_VBDBIT:
+      typedef typename Impl::GraphColor_VBD<
+          typename KernelHandle::GraphColoringHandleType, lno_row_view_t_,
+          lno_nnz_view_t_>
+          VBDGraphColoring;
+      gc = new VBDGraphColoring(num_rows, entries.extent(0), row_map, entries,
+                                gch);
+      break;
+
+    case COLORING_EB:
+      typedef typename Impl::GraphColor_EB<
+          typename KernelHandle::GraphColoringHandleType, lno_row_view_t_,
+          lno_nnz_view_t_>
+          EBGraphColoring;
+      gc = new EBGraphColoring(num_rows, entries.extent(0), row_map, entries,
+                               gch);
+      break;
+
+    case COLORING_DEFAULT: break;
+
+    default: break;
+  }
+
+  int num_phases = 0;
+  gc->color_graph(colors_out, num_phases);
+
+  delete gc;
+  double coloring_time = timer.seconds();
+  gch->add_to_overall_coloring_time(coloring_time);
+  gch->set_coloring_time(coloring_time);
+  gch->set_num_phases(num_phases);
+  gch->set_vertex_colors(colors_out);
+}
+
 }  // namespace Impl
 }  // namespace KokkosGraph
 
diff --git a/src/graph/impl/KokkosGraph_Distance2Color_impl.hpp b/src/graph/impl/KokkosGraph_Distance2Color_impl.hpp
index ed40646711..c8dddcefb8 100644
--- a/src/graph/impl/KokkosGraph_Distance2Color_impl.hpp
+++ b/src/graph/impl/KokkosGraph_Distance2Color_impl.hpp
@@ -51,7 +51,6 @@
 #include <type_traits>
 
 #include <Kokkos_Core.hpp>
-#include <Kokkos_UniqueToken.hpp>
 
 #include <KokkosKernels_Uniform_Initialized_MemoryPool.hpp>
 #include <KokkosKernels_HashmapAccumulator.hpp>
diff --git a/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp b/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp
index 1628b715a8..195d08dc0a 100644
--- a/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp
+++ b/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp
@@ -48,6 +48,7 @@
 #include "Kokkos_Core.hpp"
 #include "Kokkos_Bitset.hpp"
 #include "KokkosKernels_Utils.hpp"
+#include "KokkosSparse_Utils.hpp"
 #include <cstdint>
 
 namespace KokkosGraph {
@@ -597,7 +598,7 @@ struct D2_MIS_FixedPriority {
         Kokkos::view_alloc(Kokkos::WithoutInitializing, "RowStatus"), numVerts);
     colStatus = status_view_t(
         Kokkos::view_alloc(Kokkos::WithoutInitializing, "ColStatus"), numVerts);
-    KokkosKernels::Impl::graph_min_max_degree<device_t, lno_t, rowmap_t>(
+    KokkosSparse::Impl::graph_min_max_degree<device_t, lno_t, rowmap_t>(
         rowmap, minDegree, maxDegree);
     // Compute row statuses
     Kokkos::parallel_for(range_pol(0, numVerts),
diff --git a/src/graph/impl/KokkosGraph_color_d1_spec.hpp b/src/graph/impl/KokkosGraph_color_d1_spec.hpp
new file mode 100644
index 0000000000..09366f2c4e
--- /dev/null
+++ b/src/graph/impl/KokkosGraph_color_d1_spec.hpp
@@ -0,0 +1,153 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+#ifndef KOKKOSSPARSE_IMPL_COLOR_D1_SPEC_HPP_
+#define KOKKOSSPARSE_IMPL_COLOR_D1_SPEC_HPP_
+
+#include <KokkosKernels_config.h>
+
+#include <Kokkos_Core.hpp>
+#include "KokkosKernels_Handle.hpp"
+// Include the actual functors
+#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY
+#include "KokkosGraph_Distance1Color_impl.hpp"
+#endif
+
+namespace KokkosGraph {
+namespace Impl {
+// Specialization struct which defines whether a specialization exists
+template <class KernelHandle, class size_view_t_, class lno_view_t>
+struct color_d1_eti_spec_avail {
+  enum : bool { value = false };
+};
+
+}  // namespace Impl
+}  // namespace KokkosGraph
+
+#define KOKKOSGRAPH_COLOR_D1_ETI_SPEC_AVAIL(                              \
+    SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \
+    MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE)                                  \
+  template <>                                                             \
+  struct color_d1_eti_spec_avail<                                         \
+      KokkosKernels::Experimental::KokkosKernelsHandle<                   \
+          const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE,       \
+          EXEC_SPACE_TYPE, MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE>,          \
+      Kokkos::View<const OFFSET_TYPE *, LAYOUT_TYPE,                      \
+                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged>>,              \
+      Kokkos::View<const ORDINAL_TYPE *, LAYOUT_TYPE,                     \
+                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged>>> {            \
+    enum : bool { value = true };                                         \
+  };
+
+// Include the actual specialization declarations
+#include <generated_specializations_hpp/KokkosGraph_color_d1_eti_spec_avail.hpp>
+
+namespace KokkosGraph {
+namespace Impl {
+
+// Unification layer
+/// \brief Implementation of KokkosGraph::graph_color (distance-1 greedy
+/// coloring)
+
+template <class KernelHandle, class size_view_t, class lno_view_t,
+          bool tpl_spec_avail = false,
+          bool eti_spec_avail = color_d1_eti_spec_avail<
+              KernelHandle, size_view_t, lno_view_t>::value>
+struct COLOR_D1 {
+  static void color_d1(KernelHandle *handle,
+                       typename lno_view_t::non_const_value_type num_rows,
+                       size_view_t rowmap, lno_view_t entries);
+};
+
+#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY
+
+template <class KernelHandle, class size_view_t, class lno_view_t>
+struct COLOR_D1<KernelHandle, size_view_t, lno_view_t, false,
+                KOKKOSKERNELS_IMPL_COMPILE_LIBRARY> {
+  static void color_d1(KernelHandle *handle,
+                       typename lno_view_t::non_const_value_type num_rows,
+                       size_view_t rowmap, lno_view_t entries) {
+    KokkosGraph::Impl::graph_color_impl(handle, num_rows, rowmap, entries);
+  }
+};
+
+#endif
+
+}  // namespace Impl
+}  // namespace KokkosGraph
+
+#define KOKKOSGRAPH_COLOR_D1_ETI_SPEC_DECL(                               \
+    SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \
+    MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE)                                  \
+  extern template struct COLOR_D1<                                        \
+      typename KokkosKernels::Experimental::KokkosKernelsHandle<          \
+          const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE,       \
+          EXEC_SPACE_TYPE, MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE>,          \
+      Kokkos::View<const OFFSET_TYPE *, LAYOUT_TYPE,                      \
+                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged>>,              \
+      Kokkos::View<const ORDINAL_TYPE *, LAYOUT_TYPE,                     \
+                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged>>,              \
+      false, true>;
+
+#define KOKKOSGRAPH_COLOR_D1_ETI_SPEC_INST(                               \
+    SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \
+    MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE)                                  \
+  template struct COLOR_D1<                                               \
+      KokkosKernels::Experimental::KokkosKernelsHandle<                   \
+          const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE,       \
+          EXEC_SPACE_TYPE, MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE>,          \
+      Kokkos::View<const OFFSET_TYPE *, LAYOUT_TYPE,                      \
+                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged>>,              \
+      Kokkos::View<const ORDINAL_TYPE *, LAYOUT_TYPE,                     \
+                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged>>,              \
+      false, true>;
+
+#include <generated_specializations_hpp/KokkosGraph_color_d1_eti_spec_decl.hpp>
+
+#endif
diff --git a/src/impl/generated_specializations_cpp/bspgemm_numeric/KokkosSparse_bspgemm_numeric_eti_spec_inst.cpp.in b/src/impl/generated_specializations_cpp/bspgemm_numeric/KokkosSparse_bspgemm_numeric_eti_spec_inst.cpp.in
new file mode 100644
index 0000000000..eb5d74232e
--- /dev/null
+++ b/src/impl/generated_specializations_cpp/bspgemm_numeric/KokkosSparse_bspgemm_numeric_eti_spec_inst.cpp.in
@@ -0,0 +1,53 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//               KokkosKernels 0.9: Linear Algebra and Graph Kernels
+//                 Copyright 2017 Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+
+#define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true
+#include "KokkosKernels_config.h"
+
+#include "KokkosSparse_bspgemm_numeric_spec.hpp"
+namespace KokkosSparse {
+namespace Impl {
+@SPARSE_BSPGEMM_NUMERIC_ETI_INST_BLOCK@
+  } //IMPL
+} //Kokkos
\ No newline at end of file
diff --git a/src/impl/generated_specializations_cpp/color_d1/KokkosGraph_color_d1_eti_spec_inst.cpp.in b/src/impl/generated_specializations_cpp/color_d1/KokkosGraph_color_d1_eti_spec_inst.cpp.in
new file mode 100644
index 0000000000..c4e4c8efe6
--- /dev/null
+++ b/src/impl/generated_specializations_cpp/color_d1/KokkosGraph_color_d1_eti_spec_inst.cpp.in
@@ -0,0 +1,53 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//               KokkosKernels 0.9: Linear Algebra and Graph Kernels
+//                 Copyright 2017 Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+
+#define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true
+#include "KokkosKernels_config.h"
+
+#include "KokkosGraph_color_d1_spec.hpp"
+namespace KokkosGraph {
+namespace Impl {
+@GRAPH_COLOR_D1_ETI_INST_BLOCK@
+  } //IMPL 
+} //Kokkos
diff --git a/src/impl/generated_specializations_cpp/spadd_numeric/KokkosSparse_spadd_numeric_eti_spec_inst.cpp.in b/src/impl/generated_specializations_cpp/spadd_numeric/KokkosSparse_spadd_numeric_eti_spec_inst.cpp.in
new file mode 100644
index 0000000000..1ffa61b1d5
--- /dev/null
+++ b/src/impl/generated_specializations_cpp/spadd_numeric/KokkosSparse_spadd_numeric_eti_spec_inst.cpp.in
@@ -0,0 +1,53 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//               KokkosKernels 0.9: Linear Algebra and Graph Kernels
+//                 Copyright 2017 Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+
+#define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true
+#include "KokkosKernels_config.h"
+
+#include "KokkosSparse_spadd_numeric_spec.hpp"
+namespace KokkosSparse {
+namespace Impl {
+@SPARSE_SPADD_NUMERIC_ETI_INST_BLOCK@
+  } //IMPL 
+} //Kokkos
diff --git a/src/impl/generated_specializations_cpp/spadd_symbolic/KokkosSparse_spadd_symbolic_eti_spec_inst.cpp.in b/src/impl/generated_specializations_cpp/spadd_symbolic/KokkosSparse_spadd_symbolic_eti_spec_inst.cpp.in
new file mode 100644
index 0000000000..aa08a1c6c7
--- /dev/null
+++ b/src/impl/generated_specializations_cpp/spadd_symbolic/KokkosSparse_spadd_symbolic_eti_spec_inst.cpp.in
@@ -0,0 +1,53 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//               KokkosKernels 0.9: Linear Algebra and Graph Kernels
+//                 Copyright 2017 Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+
+#define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true
+#include "KokkosKernels_config.h"
+
+#include "KokkosSparse_spadd_symbolic_spec.hpp"
+namespace KokkosSparse {
+namespace Impl {
+@SPARSE_SPADD_SYMBOLIC_ETI_INST_BLOCK@
+  } //IMPL 
+} //Kokkos
diff --git a/src/impl/generated_specializations_hpp/KokkosGraph_color_d1_eti_spec_avail.hpp.in b/src/impl/generated_specializations_hpp/KokkosGraph_color_d1_eti_spec_avail.hpp.in
new file mode 100644
index 0000000000..daff73b371
--- /dev/null
+++ b/src/impl/generated_specializations_hpp/KokkosGraph_color_d1_eti_spec_avail.hpp.in
@@ -0,0 +1,51 @@
+#ifndef KOKKOSGRAPH_COLOR_D1_ETI_SPEC_AVAIL_HPP_
+#define KOKKOSGRAPH_COLOR_D1_ETI_SPEC_AVAIL_HPP_
+/*
+//@HEADER
+// ************************************************************************
+//
+//               KokkosKernels 0.9: Linear Algebra and Graph Kernels
+//                 Copyright 2017 Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+namespace KokkosGraph {
+namespace Impl {
+@GRAPH_COLOR_D1_ETI_AVAIL_BLOCK@
+  } //IMPL 
+} //Kokkos
+#endif
diff --git a/src/impl/generated_specializations_hpp/KokkosGraph_color_d1_eti_spec_decl.hpp.in b/src/impl/generated_specializations_hpp/KokkosGraph_color_d1_eti_spec_decl.hpp.in
new file mode 100644
index 0000000000..8e8ca17113
--- /dev/null
+++ b/src/impl/generated_specializations_hpp/KokkosGraph_color_d1_eti_spec_decl.hpp.in
@@ -0,0 +1,51 @@
+#ifndef KOKKOSGRAPH_COLOR_D1_ETI_SPEC_DECL_HPP_
+#define KOKKOSGRAPH_COLOR_D1_ETI_SPEC_DECL_HPP_
+/*
+//@HEADER
+// ************************************************************************
+//
+//               KokkosKernels 0.9: Linear Algebra and Graph Kernels
+//                 Copyright 2017 Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+namespace KokkosGraph {
+namespace Impl {
+@GRAPH_COLOR_D1_ETI_DECL_BLOCK@
+  } //IMPL 
+} //Kokkos
+#endif
diff --git a/src/impl/generated_specializations_hpp/KokkosSparse_bspgemm_numeric_eti_spec_avail.hpp.in b/src/impl/generated_specializations_hpp/KokkosSparse_bspgemm_numeric_eti_spec_avail.hpp.in
new file mode 100644
index 0000000000..7159192433
--- /dev/null
+++ b/src/impl/generated_specializations_hpp/KokkosSparse_bspgemm_numeric_eti_spec_avail.hpp.in
@@ -0,0 +1,51 @@
+#ifndef KOKKOSSPARSE_BSPGEMM_NUMERIC_ETI_SPEC_AVAIL_HPP_
+#define KOKKOSSPARSE_BSPGEMM_NUMERIC_ETI_SPEC_AVAIL_HPP_
+/*
+//@HEADER
+// ************************************************************************
+//
+//               KokkosKernels 0.9: Linear Algebra and Graph Kernels
+//                 Copyright 2017 Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+namespace KokkosSparse {
+namespace Impl {
+@SPARSE_BSPGEMM_NUMERIC_ETI_AVAIL_BLOCK@
+  } //IMPL
+} //Kokkos
+#endif
\ No newline at end of file
diff --git a/src/impl/generated_specializations_hpp/KokkosSparse_bspgemm_numeric_eti_spec_decl.hpp.in b/src/impl/generated_specializations_hpp/KokkosSparse_bspgemm_numeric_eti_spec_decl.hpp.in
new file mode 100644
index 0000000000..5d63c640d6
--- /dev/null
+++ b/src/impl/generated_specializations_hpp/KokkosSparse_bspgemm_numeric_eti_spec_decl.hpp.in
@@ -0,0 +1,51 @@
+#ifndef KOKKOSSPARSE_BSPGEMM_NUMERIC_ETI_SPEC_DECL_HPP_
+#define KOKKOSSPARSE_BSPGEMM_NUMERIC_ETI_SPEC_DECL_HPP_
+/*
+//@HEADER
+// ************************************************************************
+//
+//               KokkosKernels 0.9: Linear Algebra and Graph Kernels
+//                 Copyright 2017 Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+namespace KokkosSparse {
+namespace Impl {
+@SPARSE_BSPGEMM_NUMERIC_ETI_DECL_BLOCK@
+  } //IMPL
+} //Kokkos
+#endif
\ No newline at end of file
diff --git a/src/impl/generated_specializations_hpp/KokkosSparse_spadd_numeric_eti_spec_avail.hpp.in b/src/impl/generated_specializations_hpp/KokkosSparse_spadd_numeric_eti_spec_avail.hpp.in
new file mode 100644
index 0000000000..b47c423974
--- /dev/null
+++ b/src/impl/generated_specializations_hpp/KokkosSparse_spadd_numeric_eti_spec_avail.hpp.in
@@ -0,0 +1,51 @@
+#ifndef KOKKOSSPARSE_SPADD_NUMERIC_ETI_SPEC_AVAIL_HPP_
+#define KOKKOSSPARSE_SPADD_NUMERIC_ETI_SPEC_AVAIL_HPP_
+/*
+//@HEADER
+// ************************************************************************
+//
+//               KokkosKernels 0.9: Linear Algebra and Graph Kernels
+//                 Copyright 2017 Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+namespace KokkosSparse {
+namespace Impl {
+@SPARSE_SPADD_NUMERIC_ETI_AVAIL_BLOCK@
+  } //IMPL 
+} //Kokkos
+#endif
diff --git a/src/impl/generated_specializations_hpp/KokkosSparse_spadd_numeric_eti_spec_decl.hpp.in b/src/impl/generated_specializations_hpp/KokkosSparse_spadd_numeric_eti_spec_decl.hpp.in
new file mode 100644
index 0000000000..fd971bc314
--- /dev/null
+++ b/src/impl/generated_specializations_hpp/KokkosSparse_spadd_numeric_eti_spec_decl.hpp.in
@@ -0,0 +1,51 @@
+#ifndef KOKKOSSPARSE_SPADD_NUMERIC_ETI_SPEC_DECL_HPP_
+#define KOKKOSSPARSE_SPADD_NUMERIC_ETI_SPEC_DECL_HPP_
+/*
+//@HEADER
+// ************************************************************************
+//
+//               KokkosKernels 0.9: Linear Algebra and Graph Kernels
+//                 Copyright 2017 Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+namespace KokkosSparse {
+namespace Impl {
+@SPARSE_SPADD_NUMERIC_ETI_DECL_BLOCK@
+  } //IMPL 
+} //Kokkos
+#endif
diff --git a/src/impl/generated_specializations_hpp/KokkosSparse_spadd_symbolic_eti_spec_avail.hpp.in b/src/impl/generated_specializations_hpp/KokkosSparse_spadd_symbolic_eti_spec_avail.hpp.in
new file mode 100644
index 0000000000..b38552c34a
--- /dev/null
+++ b/src/impl/generated_specializations_hpp/KokkosSparse_spadd_symbolic_eti_spec_avail.hpp.in
@@ -0,0 +1,51 @@
+#ifndef KOKKOSSPARSE_SPADD_SYMBOLIC_ETI_SPEC_AVAIL_HPP_
+#define KOKKOSSPARSE_SPADD_SYMBOLIC_ETI_SPEC_AVAIL_HPP_
+/*
+//@HEADER
+// ************************************************************************
+//
+//               KokkosKernels 0.9: Linear Algebra and Graph Kernels
+//                 Copyright 2017 Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+namespace KokkosSparse {
+namespace Impl {
+@SPARSE_SPADD_SYMBOLIC_ETI_AVAIL_BLOCK@
+  } //IMPL 
+} //Kokkos
+#endif
diff --git a/src/impl/generated_specializations_hpp/KokkosSparse_spadd_symbolic_eti_spec_decl.hpp.in b/src/impl/generated_specializations_hpp/KokkosSparse_spadd_symbolic_eti_spec_decl.hpp.in
new file mode 100644
index 0000000000..ea001cb72b
--- /dev/null
+++ b/src/impl/generated_specializations_hpp/KokkosSparse_spadd_symbolic_eti_spec_decl.hpp.in
@@ -0,0 +1,51 @@
+#ifndef KOKKOSSPARSE_SPADD_SYMBOLIC_ETI_SPEC_DECL_HPP_
+#define KOKKOSSPARSE_SPADD_SYMBOLIC_ETI_SPEC_DECL_HPP_
+/*
+//@HEADER
+// ************************************************************************
+//
+//               KokkosKernels 0.9: Linear Algebra and Graph Kernels
+//                 Copyright 2017 Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+namespace KokkosSparse {
+namespace Impl {
+@SPARSE_SPADD_SYMBOLIC_ETI_DECL_BLOCK@
+  } //IMPL 
+} //Kokkos
+#endif
diff --git a/src/impl/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp b/src/impl/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp
index 33ee439316..2d67c95c3e 100644
--- a/src/impl/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp
+++ b/src/impl/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp
@@ -613,9 +613,12 @@ namespace Impl {
       KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT);                         \
       KokkosBlas::Impl::RocBlasSingleton& s =                                  \
           KokkosBlas::Impl::RocBlasSingleton::singleton();                     \
+      KOKKOS_ROCBLAS_SAFE_CALL_IMPL(                                           \
+          rocblas_set_stream(s.handle, space.hip_stream()));                   \
       KOKKOS_ROCBLAS_SAFE_CALL_IMPL(                                           \
           rocblas_dgemv(s.handle, transa, M, N, &alpha, A.data(), LDA,         \
                         X.data(), one, &beta, Y.data(), one));                 \
+      KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL));       \
       Kokkos::Profiling::popRegion();                                          \
     }                                                                          \
   };
@@ -657,9 +660,12 @@ namespace Impl {
       KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT);                         \
       KokkosBlas::Impl::RocBlasSingleton& s =                                  \
           KokkosBlas::Impl::RocBlasSingleton::singleton();                     \
+      KOKKOS_ROCBLAS_SAFE_CALL_IMPL(                                           \
+          rocblas_set_stream(s.handle, space.hip_stream()));                   \
       KOKKOS_ROCBLAS_SAFE_CALL_IMPL(                                           \
           rocblas_sgemv(s.handle, transa, M, N, &alpha, A.data(), LDA,         \
                         X.data(), one, &beta, Y.data(), one));                 \
+      KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL));       \
       Kokkos::Profiling::popRegion();                                          \
     }                                                                          \
   };
@@ -702,6 +708,8 @@ namespace Impl {
       KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT);                         \
       KokkosBlas::Impl::RocBlasSingleton& s =                                  \
           KokkosBlas::Impl::RocBlasSingleton::singleton();                     \
+      KOKKOS_ROCBLAS_SAFE_CALL_IMPL(                                           \
+          rocblas_set_stream(s.handle, space.hip_stream()));                   \
       KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zgemv(                             \
           s.handle, transa, M, N,                                              \
           reinterpret_cast<const rocblas_double_complex*>(&alpha),             \
@@ -709,6 +717,7 @@ namespace Impl {
           reinterpret_cast<const rocblas_double_complex*>(X.data()), one,      \
           reinterpret_cast<const rocblas_double_complex*>(&beta),              \
           reinterpret_cast<rocblas_double_complex*>(Y.data()), one));          \
+      KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL));       \
       Kokkos::Profiling::popRegion();                                          \
     }                                                                          \
   };
@@ -751,6 +760,8 @@ namespace Impl {
       KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT);                         \
       KokkosBlas::Impl::RocBlasSingleton& s =                                  \
           KokkosBlas::Impl::RocBlasSingleton::singleton();                     \
+      KOKKOS_ROCBLAS_SAFE_CALL_IMPL(                                           \
+          rocblas_set_stream(s.handle, space.hip_stream()));                   \
       KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cgemv(                             \
           s.handle, transa, M, N,                                              \
           reinterpret_cast<const rocblas_float_complex*>(&alpha),              \
@@ -758,6 +769,7 @@ namespace Impl {
           reinterpret_cast<const rocblas_float_complex*>(X.data()), one,       \
           reinterpret_cast<const rocblas_float_complex*>(&beta),               \
           reinterpret_cast<rocblas_float_complex*>(Y.data()), one));           \
+      KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL));       \
       Kokkos::Profiling::popRegion();                                          \
     }                                                                          \
   };
diff --git a/src/impl/tpls/KokkosBlas_trtri_tpl_spec_avail.hpp b/src/impl/tpls/KokkosBlas_trtri_tpl_spec_avail.hpp
index 974fe76eb0..c025a1a11e 100644
--- a/src/impl/tpls/KokkosBlas_trtri_tpl_spec_avail.hpp
+++ b/src/impl/tpls/KokkosBlas_trtri_tpl_spec_avail.hpp
@@ -58,7 +58,7 @@ struct trtri_tpl_spec_avail {
 #define KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL(SCALAR, LAYOUTA, MEMSPACE)         \
   template <class ExecSpace>                                               \
   struct trtri_tpl_spec_avail<                                             \
-      Kokkos::View<int, LAYOUTA, Kokkos::HostSpace,                        \
+      Kokkos::View<int, Kokkos::LayoutRight, Kokkos::HostSpace,            \
                    Kokkos::MemoryTraits<Kokkos::Unmanaged> >,              \
       Kokkos::View<SCALAR**, LAYOUTA, Kokkos::Device<ExecSpace, MEMSPACE>, \
                    Kokkos::MemoryTraits<Kokkos::Unmanaged> > > {           \
diff --git a/src/impl/tpls/KokkosBlas_trtri_tpl_spec_decl.hpp b/src/impl/tpls/KokkosBlas_trtri_tpl_spec_decl.hpp
index af9f843938..af6c186039 100644
--- a/src/impl/tpls/KokkosBlas_trtri_tpl_spec_decl.hpp
+++ b/src/impl/tpls/KokkosBlas_trtri_tpl_spec_decl.hpp
@@ -55,14 +55,14 @@ namespace Impl {
 #define KOKKOSBLAS_TRTRI_BLAS_HOST(SCALAR_TYPE, BASE_SCALAR_TYPE, LAYOUTA,     \
                                    MEM_SPACE, ETI_SPEC_AVAIL)                  \
   template <class ExecSpace>                                                   \
-  struct TRTRI<Kokkos::View<int, LAYOUTA, Kokkos::HostSpace,                   \
+  struct TRTRI<Kokkos::View<int, Kokkos::LayoutRight, Kokkos::HostSpace,       \
                             Kokkos::MemoryTraits<Kokkos::Unmanaged> >,         \
                Kokkos::View<SCALAR_TYPE**, LAYOUTA,                            \
                             Kokkos::Device<ExecSpace, MEM_SPACE>,              \
                             Kokkos::MemoryTraits<Kokkos::Unmanaged> >,         \
                true, ETI_SPEC_AVAIL> {                                         \
     typedef SCALAR_TYPE SCALAR;                                                \
-    typedef Kokkos::View<int, LAYOUTA, Kokkos::HostSpace,                      \
+    typedef Kokkos::View<int, Kokkos::LayoutRight, Kokkos::HostSpace,          \
                          Kokkos::MemoryTraits<Kokkos::Unmanaged> >             \
         RViewType;                                                             \
     typedef Kokkos::View<const SCALAR_TYPE**, LAYOUTA,                         \
@@ -104,14 +104,14 @@ namespace Impl {
 #define KOKKOSBLAS_TRTRI_BLAS_MAGMA(SCALAR_TYPE, BASE_SCALAR_TYPE, MAGMA_FN,   \
                                     LAYOUTA, MEM_SPACE, ETI_SPEC_AVAIL)        \
   template <class ExecSpace>                                                   \
-  struct TRTRI<Kokkos::View<int, LAYOUTA, Kokkos::HostSpace,                   \
+  struct TRTRI<Kokkos::View<int, Kokkos::LayoutRight, Kokkos::HostSpace,       \
                             Kokkos::MemoryTraits<Kokkos::Unmanaged> >,         \
                Kokkos::View<SCALAR_TYPE**, LAYOUTA,                            \
                             Kokkos::Device<ExecSpace, MEM_SPACE>,              \
                             Kokkos::MemoryTraits<Kokkos::Unmanaged> >,         \
                true, ETI_SPEC_AVAIL> {                                         \
     typedef SCALAR_TYPE SCALAR;                                                \
-    typedef Kokkos::View<int, LAYOUTA, Kokkos::HostSpace,                      \
+    typedef Kokkos::View<int, Kokkos::LayoutRight, Kokkos::HostSpace,          \
                          Kokkos::MemoryTraits<Kokkos::Unmanaged> >             \
         RViewType;                                                             \
     typedef Kokkos::View<const SCALAR_TYPE**, LAYOUTA,                         \
diff --git a/src/impl/tpls/KokkosKernels_tpl_handles_decl.hpp b/src/impl/tpls/KokkosKernels_tpl_handles_decl.hpp
index 50b2d1c2ef..aef089fd06 100644
--- a/src/impl/tpls/KokkosKernels_tpl_handles_decl.hpp
+++ b/src/impl/tpls/KokkosKernels_tpl_handles_decl.hpp
@@ -48,7 +48,7 @@
 #include "KokkosBlas_tpl_spec.hpp"
 
 #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
-#include "KokkosKernels_SparseUtils_cusparse.hpp"
+#include "KokkosSparse_Utils_cusparse.hpp"
 
 namespace KokkosKernels {
 namespace Impl {
diff --git a/src/impl/tpls/KokkosKernels_tpl_handles_def.hpp b/src/impl/tpls/KokkosKernels_tpl_handles_def.hpp
index 84b5386a00..a5187986e5 100644
--- a/src/impl/tpls/KokkosKernels_tpl_handles_def.hpp
+++ b/src/impl/tpls/KokkosKernels_tpl_handles_def.hpp
@@ -69,7 +69,7 @@ CusparseSingleton& CusparseSingleton::singleton() {
 #endif
 
 #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE
-#include "KokkosKernels_SparseUtils_rocsparse.hpp"
+#include "KokkosSparse_Utils_rocsparse.hpp"
 
 namespace KokkosKernels {
 namespace Impl {
diff --git a/src/impl/tpls/KokkosSparse_spadd_tpl_spec_avail.hpp b/src/impl/tpls/KokkosSparse_spadd_tpl_spec_avail.hpp
new file mode 100644
index 0000000000..9a65bc3656
--- /dev/null
+++ b/src/impl/tpls/KokkosSparse_spadd_tpl_spec_avail.hpp
@@ -0,0 +1,69 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOSPARSE_SPADD_TPL_SPEC_AVAIL_HPP_
+#define KOKKOSPARSE_SPADD_TPL_SPEC_AVAIL_HPP_
+
+namespace KokkosSparse {
+namespace Impl {
+// Specialization struct which defines whether a specialization exists
+//
+template <class KernelHandle, class a_size_view_t, class a_lno_view_t,
+          class b_size_view_t, class b_lno_view_t, class c_size_view_t>
+struct spadd_symbolic_tpl_spec_avail {
+  enum : bool { value = false };
+};
+
+template <class KernelHandle, class a_size_view_t, class a_lno_view_t,
+          class a_scalar_view_t, class b_size_view_t, class b_lno_view_t,
+          class b_scalar_view_t, class c_size_view_t, class c_lno_view_t,
+          class c_scalar_view_t>
+struct spadd_numeric_tpl_spec_avail {
+  enum : bool { value = false };
+};
+
+}  // namespace Impl
+}  // namespace KokkosSparse
+
+#endif
diff --git a/src/impl/tpls/KokkosSparse_spadd_tpl_spec_decl.hpp b/src/impl/tpls/KokkosSparse_spadd_tpl_spec_decl.hpp
new file mode 100644
index 0000000000..d9f6a19911
--- /dev/null
+++ b/src/impl/tpls/KokkosSparse_spadd_tpl_spec_decl.hpp
@@ -0,0 +1,52 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOSPARSE_SPADD_TPL_SPEC_DECL_HPP_
+#define KOKKOSPARSE_SPADD_TPL_SPEC_DECL_HPP_
+
+namespace KokkosSparse {
+namespace Impl {}
+}  // namespace KokkosSparse
+
+#endif
diff --git a/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp b/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp
index cd8287b38e..57170d6eb6 100644
--- a/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp
+++ b/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp
@@ -195,93 +195,49 @@ struct spmv_mv_bsrmatrix_tpl_spec_avail {
 
 // These versions of cuSPARSE require the ordinal and offset types to be the
 // same. For KokkosKernels, this means int/int only.
-
-#define KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(                \
-    SCALAR, ORDINAL, OFFSET, XL, YL, MEMSPACE)                                 \
-  template <>                                                                  \
-  struct spmv_mv_bsrmatrix_tpl_spec_avail<                                     \
-      const SCALAR, const ORDINAL, Kokkos::Device<Kokkos::Cuda, MEMSPACE>,     \
-      Kokkos::MemoryTraits<Kokkos::Unmanaged>, const OFFSET, const SCALAR*,    \
-      XL, Kokkos::Device<Kokkos::Cuda, MEMSPACE>,                              \
-      Kokkos::MemoryTraits<Kokkos::Unmanaged | Kokkos::RandomAccess>, SCALAR*, \
-      YL, Kokkos::Device<Kokkos::Cuda, MEMSPACE>,                              \
-      Kokkos::MemoryTraits<Kokkos::Unmanaged>, true> {                         \
-    enum : bool { value = true };                                              \
+// cuSparse level 3 does not currently support LayoutRight
+#define KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(              \
+    SCALAR, ORDINAL, OFFSET, LAYOUT, MEMSPACE)                               \
+  template <>                                                                \
+  struct spmv_mv_bsrmatrix_tpl_spec_avail<                                   \
+      const SCALAR, const ORDINAL, Kokkos::Device<Kokkos::Cuda, MEMSPACE>,   \
+      Kokkos::MemoryTraits<Kokkos::Unmanaged>, const OFFSET, const SCALAR**, \
+      LAYOUT, Kokkos::Device<Kokkos::Cuda, MEMSPACE>,                        \
+      Kokkos::MemoryTraits<Kokkos::Unmanaged | Kokkos::RandomAccess>,        \
+      SCALAR**, LAYOUT, Kokkos::Device<Kokkos::Cuda, MEMSPACE>,              \
+      Kokkos::MemoryTraits<Kokkos::Unmanaged>, false> {                      \
+    enum : bool { value = true };                                            \
   };
 
 #if (9000 <= CUDA_VERSION)
 
 KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(float, int, int,
-                                                       Kokkos::LayoutLeft,
                                                        Kokkos::LayoutLeft,
                                                        Kokkos::CudaSpace)
 KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(double, int, int,
                                                        Kokkos::LayoutLeft,
-                                                       Kokkos::LayoutLeft,
-                                                       Kokkos::CudaSpace)
-KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(float, int, int,
-                                                       Kokkos::LayoutRight,
-                                                       Kokkos::LayoutRight,
-                                                       Kokkos::CudaSpace)
-KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(double, int, int,
-                                                       Kokkos::LayoutRight,
-                                                       Kokkos::LayoutRight,
                                                        Kokkos::CudaSpace)
 KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(float, int, int,
-                                                       Kokkos::LayoutLeft,
                                                        Kokkos::LayoutLeft,
                                                        Kokkos::CudaUVMSpace)
 KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(double, int, int,
-                                                       Kokkos::LayoutLeft,
                                                        Kokkos::LayoutLeft,
                                                        Kokkos::CudaUVMSpace)
-KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(float, int, int,
-                                                       Kokkos::LayoutRight,
-                                                       Kokkos::LayoutRight,
-                                                       Kokkos::CudaUVMSpace)
-KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(double, int, int,
-                                                       Kokkos::LayoutRight,
-                                                       Kokkos::LayoutRight,
-                                                       Kokkos::CudaUVMSpace)
 KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex<float>,
                                                        int, int,
                                                        Kokkos::LayoutLeft,
-                                                       Kokkos::LayoutLeft,
                                                        Kokkos::CudaSpace)
 KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex<double>,
                                                        int, int,
                                                        Kokkos::LayoutLeft,
-                                                       Kokkos::LayoutLeft,
                                                        Kokkos::CudaSpace)
 KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex<float>,
                                                        int, int,
-                                                       Kokkos::LayoutRight,
-                                                       Kokkos::LayoutRight,
-                                                       Kokkos::CudaSpace)
-KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex<double>,
-                                                       int, int,
-                                                       Kokkos::LayoutRight,
-                                                       Kokkos::LayoutRight,
-                                                       Kokkos::CudaSpace)
-KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex<float>,
-                                                       int, int,
-                                                       Kokkos::LayoutLeft,
                                                        Kokkos::LayoutLeft,
                                                        Kokkos::CudaUVMSpace)
 KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex<double>,
                                                        int, int,
                                                        Kokkos::LayoutLeft,
-                                                       Kokkos::LayoutLeft,
-                                                       Kokkos::CudaUVMSpace)
-KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex<float>,
-                                                       int, int,
-                                                       Kokkos::LayoutRight,
-                                                       Kokkos::LayoutRight,
-                                                       Kokkos::CudaUVMSpace)
-KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex<double>,
-                                                       int, int,
-                                                       Kokkos::LayoutRight,
-                                                       Kokkos::LayoutRight,
                                                        Kokkos::CudaUVMSpace)
 
 #endif  // CUDA/CUSPARSE >= 9.0?
diff --git a/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp b/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp
index a1ae213ea9..93457f9837 100644
--- a/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp
+++ b/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp
@@ -42,10 +42,11 @@
 //@HEADER
 */
 
-#ifndef KOKKOSKERNELS_KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_DECL_HPP
-#define KOKKOSKERNELS_KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_DECL_HPP
+#ifndef KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_DECL_HPP
+#define KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_DECL_HPP
 
 #include "KokkosKernels_Controls.hpp"
+#include "KokkosSparse_Utils_mkl.hpp"
 
 #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL
 #include <mkl.h>
@@ -57,26 +58,7 @@ namespace Impl {
 #if (__INTEL_MKL__ > 2017)
 // MKL 2018 and above: use new interface: sparse_matrix_t and mkl_sparse_?_mv()
 
-namespace BSR {
-inline void mkl_safe_call(int errcode) {
-  if (errcode != SPARSE_STATUS_SUCCESS)
-    throw std::runtime_error("MKL returned non-success error code");
-}
-
-inline sparse_operation_t mode_kk_to_mkl(char mode_kk) {
-  switch (toupper(mode_kk)) {
-    case 'N': return SPARSE_OPERATION_NON_TRANSPOSE;
-    case 'T': return SPARSE_OPERATION_TRANSPOSE;
-    case 'H': return SPARSE_OPERATION_CONJUGATE_TRANSPOSE;
-    default:;
-  }
-  throw std::invalid_argument(
-      "Invalid mode for MKL (should be one of N, T, H)");
-}
-}  // namespace BSR
-
-using BSR::mkl_safe_call;
-using BSR::mode_kk_to_mkl;
+using KokkosSparse::Impl::mode_kk_to_mkl;
 
 inline matrix_descr getDescription() {
   matrix_descr A_descr;
@@ -91,13 +73,14 @@ inline void spmv_block_impl_mkl(sparse_operation_t op, float alpha, float beta,
                                 const int* Aentries, const float* Avalues,
                                 const float* x, float* y) {
   sparse_matrix_t A_mkl;
-  mkl_safe_call(mkl_sparse_s_create_bsr(
+  KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_s_create_bsr(
       &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b,
       const_cast<int*>(Arowptrs), const_cast<int*>(Arowptrs + 1),
       const_cast<int*>(Aentries), const_cast<float*>(Avalues)));
 
   matrix_descr A_descr = getDescription();
-  mkl_safe_call(mkl_sparse_s_mv(op, alpha, A_mkl, A_descr, x, beta, y));
+  KOKKOSKERNELS_MKL_SAFE_CALL(
+      mkl_sparse_s_mv(op, alpha, A_mkl, A_descr, x, beta, y));
 }
 
 inline void spmv_block_impl_mkl(sparse_operation_t op, double alpha,
@@ -106,13 +89,14 @@ inline void spmv_block_impl_mkl(sparse_operation_t op, double alpha,
                                 const double* Avalues, const double* x,
                                 double* y) {
   sparse_matrix_t A_mkl;
-  mkl_safe_call(mkl_sparse_d_create_bsr(
+  KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_bsr(
       &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b,
       const_cast<int*>(Arowptrs), const_cast<int*>(Arowptrs + 1),
       const_cast<int*>(Aentries), const_cast<double*>(Avalues)));
 
   matrix_descr A_descr = getDescription();
-  mkl_safe_call(mkl_sparse_d_mv(op, alpha, A_mkl, A_descr, x, beta, y));
+  KOKKOSKERNELS_MKL_SAFE_CALL(
+      mkl_sparse_d_mv(op, alpha, A_mkl, A_descr, x, beta, y));
 }
 
 inline void spmv_block_impl_mkl(sparse_operation_t op,
@@ -123,17 +107,17 @@ inline void spmv_block_impl_mkl(sparse_operation_t op,
                                 const Kokkos::complex<float>* x,
                                 Kokkos::complex<float>* y) {
   sparse_matrix_t A_mkl;
-  mkl_safe_call(mkl_sparse_c_create_bsr(
+  KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_create_bsr(
       &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b,
       const_cast<int*>(Arowptrs), const_cast<int*>(Arowptrs + 1),
       const_cast<int*>(Aentries), (MKL_Complex8*)Avalues));
 
-  MKL_Complex8& alpha_mkl = reinterpret_cast<MKL_Complex8&>(alpha);
-  MKL_Complex8& beta_mkl  = reinterpret_cast<MKL_Complex8&>(beta);
-  matrix_descr A_descr    = getDescription();
-  mkl_safe_call(mkl_sparse_c_mv(op, alpha_mkl, A_mkl, A_descr,
-                                reinterpret_cast<const MKL_Complex8*>(x),
-                                beta_mkl, reinterpret_cast<MKL_Complex8*>(y)));
+  MKL_Complex8 alpha_mkl{alpha.real(), alpha.imag()};
+  MKL_Complex8 beta_mkl{beta.real(), beta.imag()};
+  matrix_descr A_descr = getDescription();
+  KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_mv(
+      op, alpha_mkl, A_mkl, A_descr, reinterpret_cast<const MKL_Complex8*>(x),
+      beta_mkl, reinterpret_cast<MKL_Complex8*>(y)));
 }
 
 inline void spmv_block_impl_mkl(sparse_operation_t op,
@@ -144,17 +128,17 @@ inline void spmv_block_impl_mkl(sparse_operation_t op,
                                 const Kokkos::complex<double>* x,
                                 Kokkos::complex<double>* y) {
   sparse_matrix_t A_mkl;
-  mkl_safe_call(mkl_sparse_z_create_bsr(
+  KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_create_bsr(
       &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b,
       const_cast<int*>(Arowptrs), const_cast<int*>(Arowptrs + 1),
       const_cast<int*>(Aentries), (MKL_Complex16*)Avalues));
 
-  matrix_descr A_descr     = getDescription();
-  MKL_Complex16& alpha_mkl = reinterpret_cast<MKL_Complex16&>(alpha);
-  MKL_Complex16& beta_mkl  = reinterpret_cast<MKL_Complex16&>(beta);
-  mkl_safe_call(mkl_sparse_z_mv(op, alpha_mkl, A_mkl, A_descr,
-                                reinterpret_cast<const MKL_Complex16*>(x),
-                                beta_mkl, reinterpret_cast<MKL_Complex16*>(y)));
+  matrix_descr A_descr = getDescription();
+  MKL_Complex16 alpha_mkl{alpha.real(), alpha.imag()};
+  MKL_Complex16 beta_mkl{beta.real(), beta.imag()};
+  KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_mv(
+      op, alpha_mkl, A_mkl, A_descr, reinterpret_cast<const MKL_Complex16*>(x),
+      beta_mkl, reinterpret_cast<MKL_Complex16*>(y)));
 }
 
 inline void spm_mv_block_impl_mkl(sparse_operation_t op, float alpha,
@@ -163,15 +147,15 @@ inline void spm_mv_block_impl_mkl(sparse_operation_t op, float alpha,
                                   const float* Avalues, const float* x,
                                   int colx, int ldx, float* y, int ldy) {
   sparse_matrix_t A_mkl;
-  mkl_safe_call(mkl_sparse_s_create_bsr(
+  KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_s_create_bsr(
       &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b,
       const_cast<int*>(Arowptrs), const_cast<int*>(Arowptrs + 1),
       const_cast<int*>(Aentries), const_cast<float*>(Avalues)));
 
   matrix_descr A_descr = getDescription();
-  mkl_safe_call(mkl_sparse_s_mm(op, alpha, A_mkl, A_descr,
-                                SPARSE_LAYOUT_ROW_MAJOR, x, colx, ldx, beta, y,
-                                ldy));
+  KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_s_mm(op, alpha, A_mkl, A_descr,
+                                              SPARSE_LAYOUT_ROW_MAJOR, x, colx,
+                                              ldx, beta, y, ldy));
 }
 
 inline void spm_mv_block_impl_mkl(sparse_operation_t op, double alpha,
@@ -180,15 +164,15 @@ inline void spm_mv_block_impl_mkl(sparse_operation_t op, double alpha,
                                   const double* Avalues, const double* x,
                                   int colx, int ldx, double* y, int ldy) {
   sparse_matrix_t A_mkl;
-  mkl_safe_call(mkl_sparse_d_create_bsr(
+  KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_bsr(
       &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b,
       const_cast<int*>(Arowptrs), const_cast<int*>(Arowptrs + 1),
       const_cast<int*>(Aentries), const_cast<double*>(Avalues)));
 
   matrix_descr A_descr = getDescription();
-  mkl_safe_call(mkl_sparse_d_mm(op, alpha, A_mkl, A_descr,
-                                SPARSE_LAYOUT_ROW_MAJOR, x, colx, ldx, beta, y,
-                                ldy));
+  KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_mm(op, alpha, A_mkl, A_descr,
+                                              SPARSE_LAYOUT_ROW_MAJOR, x, colx,
+                                              ldx, beta, y, ldy));
 }
 
 inline void spm_mv_block_impl_mkl(sparse_operation_t op,
@@ -200,15 +184,15 @@ inline void spm_mv_block_impl_mkl(sparse_operation_t op,
                                   const Kokkos::complex<float>* x, int colx,
                                   int ldx, Kokkos::complex<float>* y, int ldy) {
   sparse_matrix_t A_mkl;
-  mkl_safe_call(mkl_sparse_c_create_bsr(
+  KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_create_bsr(
       &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b,
       const_cast<int*>(Arowptrs), const_cast<int*>(Arowptrs + 1),
       const_cast<int*>(Aentries), (MKL_Complex8*)Avalues));
 
-  MKL_Complex8& alpha_mkl = reinterpret_cast<MKL_Complex8&>(alpha);
-  MKL_Complex8& beta_mkl  = reinterpret_cast<MKL_Complex8&>(beta);
-  matrix_descr A_descr    = getDescription();
-  mkl_safe_call(
+  MKL_Complex8 alpha_mkl{alpha.real(), alpha.imag()};
+  MKL_Complex8 beta_mkl{beta.real(), beta.imag()};
+  matrix_descr A_descr = getDescription();
+  KOKKOSKERNELS_MKL_SAFE_CALL(
       mkl_sparse_c_mm(op, alpha_mkl, A_mkl, A_descr, SPARSE_LAYOUT_ROW_MAJOR,
                       reinterpret_cast<const MKL_Complex8*>(x), colx, ldx,
                       beta_mkl, reinterpret_cast<MKL_Complex8*>(y), ldy));
@@ -221,15 +205,15 @@ inline void spm_mv_block_impl_mkl(
     const Kokkos::complex<double>* x, int colx, int ldx,
     Kokkos::complex<double>* y, int ldy) {
   sparse_matrix_t A_mkl;
-  mkl_safe_call(mkl_sparse_z_create_bsr(
+  KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_create_bsr(
       &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b,
       const_cast<int*>(Arowptrs), const_cast<int*>(Arowptrs + 1),
       const_cast<int*>(Aentries), (MKL_Complex16*)Avalues));
 
-  matrix_descr A_descr     = getDescription();
-  MKL_Complex16& alpha_mkl = reinterpret_cast<MKL_Complex16&>(alpha);
-  MKL_Complex16& beta_mkl  = reinterpret_cast<MKL_Complex16&>(beta);
-  mkl_safe_call(
+  matrix_descr A_descr = getDescription();
+  MKL_Complex16 alpha_mkl{alpha.real(), alpha.imag()};
+  MKL_Complex16 beta_mkl{beta.real(), beta.imag()};
+  KOKKOSKERNELS_MKL_SAFE_CALL(
       mkl_sparse_z_mm(op, alpha_mkl, A_mkl, A_descr, SPARSE_LAYOUT_ROW_MAJOR,
                       reinterpret_cast<const MKL_Complex16*>(x), colx, ldx,
                       beta_mkl, reinterpret_cast<MKL_Complex16*>(y), ldy));
@@ -470,7 +454,7 @@ KOKKOSSPARSE_SPMV_MV_MKL(Kokkos::complex<double>, Kokkos::OpenMP,
 // cuSPARSE
 #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
 #include "cusparse.h"
-#include "KokkosKernels_SparseUtils_cusparse.hpp"
+#include "KokkosSparse_Utils_cusparse.hpp"
 
 //
 // From  https://docs.nvidia.com/cuda/cusparse/index.html#bsrmv
@@ -503,7 +487,7 @@ void spmv_block_impl_cusparse(
     default: {
       std::cerr << "Mode " << mode << " invalid for cusparse[*]bsrmv.\n";
       throw std::invalid_argument("Invalid mode");
-    } break;
+    }
   }
 
 #if (9000 <= CUDA_VERSION)
@@ -578,8 +562,24 @@ void spmv_block_impl_cusparse(
 // - Only blockDim > 1 is supported
 // - Only CUSPARSE_OPERATION_NON_TRANSPOSE is supported
 // - Only CUSPARSE_MATRIX_TYPE_GENERAL is supported.
+// - Only LayoutLeft for X and Y:
+//   for X,Y LayoutLeft we want cuSparse to do
+//   C = A * B + C
+//   and for X,Y LayoutRight we want cuSparse to do
+//   trans(C) = A * trans(B) + trans(C)
+//   -> t(t(C)) = t(A * t(B)) + t(t(C))
+//   ->       C = t(t(B)) * t(A) + C
+//   ->       C = B * t(A) + C
+//   This is impossible in cuSparse without explicitly transposing C,
+//   so we just do not support LayoutRight in cuSparse TPL now
 //
-template <class AMatrix, class XVector, class YVector>
+template <
+    class AMatrix, class XVector, class YVector,
+    std::enable_if_t<std::is_same<Kokkos::LayoutLeft,
+                                  typename XVector::array_layout>::value &&
+                         std::is_same<Kokkos::LayoutLeft,
+                                      typename YVector::array_layout>::value,
+                     bool> = true>
 void spm_mv_block_impl_cusparse(
     const KokkosKernels::Experimental::Controls& controls, const char mode[],
     typename YVector::non_const_value_type const& alpha, const AMatrix& A,
@@ -599,12 +599,14 @@ void spm_mv_block_impl_cusparse(
     default: {
       std::cerr << "Mode " << mode << " invalid for cusparse[*]bsrmv.\n";
       throw std::invalid_argument("Invalid mode");
-    } break;
+    }
   }
 
   int colx = static_cast<int>(x.extent(1));
-  int ldx  = static_cast<int>(x.stride_1());
-  int ldy  = static_cast<int>(y.stride_1());
+
+  // ldx and ldy should be the leading dimension of X,Y respectively
+  const int ldx = static_cast<int>(x.extent(0));
+  const int ldy = static_cast<int>(y.extent(0));
 
 #if (9000 <= CUDA_VERSION)
 
@@ -761,29 +763,31 @@ KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex<float>, int, int, Kokkos::LayoutLeft,
 KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex<float>, int, int,
                            Kokkos::LayoutRight, Kokkos::CudaUVMSpace,
                            KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
-#endif
+#endif  // 9000 <= CUDA_VERSION
 
 #undef KOKKOSSPARSE_SPMV_CUSPARSE
 
-#define KOKKOSSPARSE_SPMV_MV_CUSPARSE(SCALAR, ORDINAL, OFFSET, LAYOUT, SPACE,  \
-                                      COMPILE_LIBRARY)                         \
+// cuSparse TPL does not support LayoutRight for this operation
+// only specialize for LayoutLeft
+#define KOKKOSSPARSE_SPMV_MV_CUSPARSE(SCALAR, ORDINAL, OFFSET, SPACE,          \
+                                      ETI_AVAIL)                               \
   template <>                                                                  \
   struct SPMV_MV_BSRMATRIX<                                                    \
       SCALAR const, ORDINAL const, Kokkos::Device<Kokkos::Cuda, SPACE>,        \
       Kokkos::MemoryTraits<Kokkos::Unmanaged>, OFFSET const, SCALAR const**,   \
-      LAYOUT, Kokkos::Device<Kokkos::Cuda, SPACE>,                             \
+      Kokkos::LayoutLeft, Kokkos::Device<Kokkos::Cuda, SPACE>,                 \
       Kokkos::MemoryTraits<Kokkos::Unmanaged | Kokkos::RandomAccess>,          \
-      SCALAR**, LAYOUT, Kokkos::Device<Kokkos::Cuda, SPACE>,                   \
-      Kokkos::MemoryTraits<Kokkos::Unmanaged>, true, true, COMPILE_LIBRARY> {  \
+      SCALAR**, Kokkos::LayoutLeft, Kokkos::Device<Kokkos::Cuda, SPACE>,       \
+      Kokkos::MemoryTraits<Kokkos::Unmanaged>, false, true, ETI_AVAIL> {       \
     using device_type       = Kokkos::Device<Kokkos::Cuda, SPACE>;             \
     using memory_trait_type = Kokkos::MemoryTraits<Kokkos::Unmanaged>;         \
     using AMatrix = BsrMatrix<SCALAR const, ORDINAL const, device_type,        \
                               memory_trait_type, OFFSET const>;                \
     using XVector = Kokkos::View<                                              \
-        SCALAR const**, LAYOUT, device_type,                                   \
+        SCALAR const**, Kokkos::LayoutLeft, device_type,                       \
         Kokkos::MemoryTraits<Kokkos::Unmanaged | Kokkos::RandomAccess>>;       \
-    using YVector =                                                            \
-        Kokkos::View<SCALAR**, LAYOUT, device_type, memory_trait_type>;        \
+    using YVector  = Kokkos::View<SCALAR**, Kokkos::LayoutLeft, device_type,   \
+                                 memory_trait_type>;                          \
     using Controls = KokkosKernels::Experimental::Controls;                    \
                                                                                \
     using coefficient_type = typename YVector::non_const_value_type;           \
@@ -802,55 +806,32 @@ KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex<float>, int, int,
   };
 
 #if (9000 <= CUDA_VERSION)
-KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::LayoutLeft,
-                              Kokkos::CudaSpace,
-                              KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
-KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::LayoutRight,
-                              Kokkos::CudaSpace,
-                              KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
-KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::LayoutLeft,
-                              Kokkos::CudaSpace,
-                              KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
-KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::LayoutRight,
-                              Kokkos::CudaSpace,
-                              KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
+KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::CudaSpace, true)
+KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::CudaSpace, false)
+KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::CudaSpace, true)
+KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::CudaSpace, false)
 KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex<double>, int, int,
-                              Kokkos::LayoutLeft, Kokkos::CudaSpace,
-                              KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
+                              Kokkos::CudaSpace, true)
 KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex<double>, int, int,
-                              Kokkos::LayoutRight, Kokkos::CudaSpace,
-                              KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
+                              Kokkos::CudaSpace, false)
 KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex<float>, int, int,
-                              Kokkos::LayoutLeft, Kokkos::CudaSpace,
-                              KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
+                              Kokkos::CudaSpace, true)
 KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex<float>, int, int,
-                              Kokkos::LayoutRight, Kokkos::CudaSpace,
-                              KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
-KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::LayoutLeft,
-                              Kokkos::CudaUVMSpace,
-                              KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
-KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::LayoutRight,
-                              Kokkos::CudaUVMSpace,
-                              KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
-KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::LayoutLeft,
-                              Kokkos::CudaUVMSpace,
-                              KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
-KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::LayoutRight,
-                              Kokkos::CudaUVMSpace,
-                              KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
+                              Kokkos::CudaSpace, false)
+KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::CudaUVMSpace, true)
+KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::CudaUVMSpace, false)
+KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::CudaUVMSpace, true)
+KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::CudaUVMSpace, false)
 KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex<double>, int, int,
-                              Kokkos::LayoutLeft, Kokkos::CudaUVMSpace,
-                              KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
+                              Kokkos::CudaUVMSpace, true)
 KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex<double>, int, int,
-                              Kokkos::LayoutRight, Kokkos::CudaUVMSpace,
-                              KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
+                              Kokkos::CudaUVMSpace, false)
 KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex<float>, int, int,
-                              Kokkos::LayoutLeft, Kokkos::CudaUVMSpace,
-                              KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
+                              Kokkos::CudaUVMSpace, true)
 KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex<float>, int, int,
-                              Kokkos::LayoutRight, Kokkos::CudaUVMSpace,
-                              KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
-#endif
+                              Kokkos::CudaUVMSpace, false)
+
+#endif  // 9000 <= CUDA_VERSION
 
 #undef KOKKOSSPARSE_SPMV_MV_CUSPARSE
 
@@ -858,6 +839,6 @@ KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex<float>, int, int,
 }  // namespace Experimental
 }  // namespace KokkosSparse
 
-#endif
+#endif  // KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
 
-#endif  // KOKKOSKERNELS_KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_DECL_HPP
+#endif  // KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_DECL_HPP
diff --git a/src/impl/tpls/KokkosSparse_spmv_mv_tpl_spec_avail.hpp b/src/impl/tpls/KokkosSparse_spmv_mv_tpl_spec_avail.hpp
new file mode 100644
index 0000000000..ef23f6ec9a
--- /dev/null
+++ b/src/impl/tpls/KokkosSparse_spmv_mv_tpl_spec_avail.hpp
@@ -0,0 +1,175 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOSPARSE_SPMV_MV_TPL_SPEC_AVAIL_HPP_
+#define KOKKOSPARSE_SPMV_MV_TPL_SPEC_AVAIL_HPP_
+
+namespace KokkosSparse {
+namespace Impl {
+
+// Specialization struct which defines whether a specialization exists
+template <class AT, class AO, class AD, class AM, class AS, class XT, class XL,
+          class XD, class XM, class YT, class YL, class YD, class YM,
+          const bool integerScalarType =
+              std::is_integral<typename std::decay<AT>::type>::value>
+struct spmv_mv_tpl_spec_avail {
+  enum : bool { value = false };
+};
+
+#define KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(SCALAR, ORDINAL, OFFSET, \
+                                                     XL, YL, MEMSPACE)        \
+  template <>                                                                 \
+  struct spmv_mv_tpl_spec_avail<                                              \
+      const SCALAR, const ORDINAL, Kokkos::Device<Kokkos::Cuda, MEMSPACE>,    \
+      Kokkos::MemoryTraits<Kokkos::Unmanaged>, const OFFSET, const SCALAR**,  \
+      XL, Kokkos::Device<Kokkos::Cuda, MEMSPACE>,                             \
+      Kokkos::MemoryTraits<Kokkos::Unmanaged | Kokkos::RandomAccess>,         \
+      SCALAR**, YL, Kokkos::Device<Kokkos::Cuda, MEMSPACE>,                   \
+      Kokkos::MemoryTraits<Kokkos::Unmanaged> > {                             \
+    enum : bool { value = true };                                             \
+  };
+
+/* CUSPARSE_VERSION 10300 and lower seem to have a bug in cusparseSpMM
+non-transpose that produces incorrect result. This is cusparse distributed with
+CUDA 10.1.243. The bug seems to be resolved by CUSPARSE 10301 (present by
+CUDA 10.2.89) */
+#if defined(CUSPARSE_VERSION) && (10301 <= CUSPARSE_VERSION)
+KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(double, int, int,
+                                             Kokkos::LayoutLeft,
+                                             Kokkos::LayoutLeft,
+                                             Kokkos::CudaSpace)
+KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(double, int, int,
+                                             Kokkos::LayoutRight,
+                                             Kokkos::LayoutLeft,
+                                             Kokkos::CudaSpace)
+
+KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(float, int, int,
+                                             Kokkos::LayoutLeft,
+                                             Kokkos::LayoutLeft,
+                                             Kokkos::CudaSpace)
+KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(float, int, int,
+                                             Kokkos::LayoutRight,
+                                             Kokkos::LayoutLeft,
+                                             Kokkos::CudaSpace)
+
+KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex<double>, int, int,
+                                             Kokkos::LayoutLeft,
+                                             Kokkos::LayoutLeft,
+                                             Kokkos::CudaSpace)
+KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex<double>, int, int,
+                                             Kokkos::LayoutRight,
+                                             Kokkos::LayoutLeft,
+                                             Kokkos::CudaSpace)
+
+KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex<float>, int, int,
+                                             Kokkos::LayoutLeft,
+                                             Kokkos::LayoutLeft,
+                                             Kokkos::CudaSpace)
+KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex<float>, int, int,
+                                             Kokkos::LayoutRight,
+                                             Kokkos::LayoutLeft,
+                                             Kokkos::CudaSpace)
+
+KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(double, int, int,
+                                             Kokkos::LayoutLeft,
+                                             Kokkos::LayoutLeft,
+                                             Kokkos::CudaUVMSpace)
+KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(double, int, int,
+                                             Kokkos::LayoutRight,
+                                             Kokkos::LayoutLeft,
+                                             Kokkos::CudaUVMSpace)
+
+KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(float, int, int,
+                                             Kokkos::LayoutLeft,
+                                             Kokkos::LayoutLeft,
+                                             Kokkos::CudaUVMSpace)
+KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(float, int, int,
+                                             Kokkos::LayoutRight,
+                                             Kokkos::LayoutLeft,
+                                             Kokkos::CudaUVMSpace)
+
+KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex<double>, int, int,
+                                             Kokkos::LayoutLeft,
+                                             Kokkos::LayoutLeft,
+                                             Kokkos::CudaUVMSpace)
+KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex<double>, int, int,
+                                             Kokkos::LayoutRight,
+                                             Kokkos::LayoutLeft,
+                                             Kokkos::CudaUVMSpace)
+
+KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex<float>, int, int,
+                                             Kokkos::LayoutLeft,
+                                             Kokkos::LayoutLeft,
+                                             Kokkos::CudaUVMSpace)
+KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex<float>, int, int,
+                                             Kokkos::LayoutRight,
+                                             Kokkos::LayoutLeft,
+                                             Kokkos::CudaUVMSpace)
+
+#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT
+KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::Experimental::half_t, int,
+                                             int, Kokkos::LayoutLeft,
+                                             Kokkos::LayoutLeft,
+                                             Kokkos::CudaSpace)
+KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::Experimental::half_t, int,
+                                             int, Kokkos::LayoutRight,
+                                             Kokkos::LayoutLeft,
+                                             Kokkos::CudaSpace)
+
+KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::Experimental::half_t, int,
+                                             int, Kokkos::LayoutLeft,
+                                             Kokkos::LayoutLeft,
+                                             Kokkos::CudaUVMSpace)
+KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::Experimental::half_t, int,
+                                             int, Kokkos::LayoutRight,
+                                             Kokkos::LayoutLeft,
+                                             Kokkos::CudaUVMSpace)
+
+#endif
+#endif  // defined(CUSPARSE_VERSION) && (10300 <= CUSPARSE_VERSION)
+
+}  // namespace Impl
+}  // namespace KokkosSparse
+
+#endif  // KOKKOSPARSE_SPMV_MV_TPL_SPEC_AVAIL_HPP_
diff --git a/src/impl/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp b/src/impl/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp
new file mode 100644
index 0000000000..0bfeec3288
--- /dev/null
+++ b/src/impl/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp
@@ -0,0 +1,336 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOSPARSE_SPMV_MV_TPL_SPEC_DECL_HPP_
+#define KOKKOSPARSE_SPMV_MV_TPL_SPEC_DECL_HPP_
+
+#include "KokkosKernels_Controls.hpp"
+
+#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
+
+/* CUSPARSE_VERSION < 10301 either doesn't have cusparseSpMM
+   or the non-tranpose version produces incorrect results.
+*/
+#if defined(CUSPARSE_VERSION) && (10301 <= CUSPARSE_VERSION)
+#include "cusparse.h"
+#include "KokkosSparse_Utils_cusparse.hpp"
+
+namespace KokkosSparse {
+namespace Impl {
+
+/* Derive a compute type for various operand types.
+   cusparseSpMM does not always allow the same compute type as operand types
+   This should be consistent with the allowed operand types for cusparseSpMM,
+   as needed for TPL availability. Current definition does not comprehensively
+   cover all cusparseSpMM options.
+
+   cuSparse 11.5.1+ does not support uniform precision for FP16
+   Otherwise, uniform precision is supported
+*/
+template <typename AScalar, typename XScalar = AScalar,
+          typename YScalar = AScalar>
+cudaDataType compute_type() {
+  return cuda_data_type_from<AScalar>();
+}
+#if CUSPARSE_VERSION >= 11501
+template <>
+inline cudaDataType compute_type<Kokkos::Experimental::half_t>() {
+  return CUDA_R_32F;
+}
+#else
+template <>
+inline cudaDataType compute_type<Kokkos::Experimental::half_t>() {
+  return cuda_data_type_from<Kokkos::Experimental::half_t>();
+}
+#endif
+
+/*! \brief convert a 2D view to a cusparseDnMatDescr_t
+
+*/
+template <typename ViewType, std::enable_if_t<ViewType::rank == 2, bool> = true>
+cusparseDnMatDescr_t make_cusparse_dn_mat_descr_t(ViewType &view) {
+  const int64_t rows = view.extent(0);
+  const int64_t cols = view.extent(1);
+  const int64_t ld   = view.extent(0);
+
+  // cusparseCreateCsr notes it is safe to const_cast this away for input
+  // pointers to a descriptor as long as that descriptor is not an output
+  // parameter
+  void *values =
+      const_cast<typename ViewType::non_const_value_type *>(view.data());
+
+  cudaDataType valueType =
+      cuda_data_type_from<typename ViewType::non_const_value_type>();
+
+  // col-major is the only supported order in 10301
+  // ignore the layout of the provided view, and expect the caller to
+  // fix with a transpose operation, if possible.
+  // This should be revisited once cusparse supports row-major dense matrices
+  const cusparseOrder_t order = CUSPARSE_ORDER_COL;
+
+  cusparseDnMatDescr_t descr;
+  KOKKOS_CUSPARSE_SAFE_CALL(
+      cusparseCreateDnMat(&descr, rows, cols, ld, values, valueType, order));
+
+  return descr;
+}
+
+template <class AMatrix, class XVector, class YVector>
+void spmv_mv_cusparse(const KokkosKernels::Experimental::Controls &controls,
+                      const char mode[],
+                      typename YVector::non_const_value_type const &alpha,
+                      const AMatrix &A, const XVector &x,
+                      typename YVector::non_const_value_type const &beta,
+                      const YVector &y) {
+  static_assert(XVector::rank == 2,
+                "should only be instantiated for multivector");
+  static_assert(YVector::rank == 2,
+                "should only be instantiated for multivector");
+
+  using offset_type  = typename AMatrix::non_const_size_type;
+  using entry_type   = typename AMatrix::non_const_ordinal_type;
+  using value_type   = typename AMatrix::non_const_value_type;
+  using x_value_type = typename XVector::non_const_value_type;
+  using y_value_type = typename YVector::non_const_value_type;
+
+  /* initialize cusparse library */
+  cusparseHandle_t cusparseHandle = controls.getCusparseHandle();
+
+  /* Set the operation mode */
+  cusparseOperation_t opA;
+  switch (toupper(mode[0])) {
+    case 'N': opA = CUSPARSE_OPERATION_NON_TRANSPOSE; break;
+    case 'T': opA = CUSPARSE_OPERATION_TRANSPOSE; break;
+    case 'H': opA = CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE; break;
+    default: {
+      std::cerr << "Mode " << mode << " invalid for cuSPARSE SpMV MV.\n";
+      throw std::invalid_argument("Invalid mode");
+    }
+  }
+
+  /* Check that cusparse can handle the types of the input Kokkos::CrsMatrix */
+  const cusparseIndexType_t myCusparseOffsetType =
+      cusparse_index_type_t_from<offset_type>();
+  const cusparseIndexType_t myCusparseEntryType =
+      cusparse_index_type_t_from<entry_type>();
+  const cudaDataType aCusparseType = cuda_data_type_from<value_type>();
+
+  /* create matrix */
+  cusparseSpMatDescr_t A_cusparse;
+  KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateCsr(
+      &A_cusparse, A.numRows(), A.numCols(), A.nnz(),
+      (void *)A.graph.row_map.data(), (void *)A.graph.entries.data(),
+      (void *)A.values.data(), myCusparseOffsetType, myCusparseEntryType,
+      CUSPARSE_INDEX_BASE_ZERO, aCusparseType));
+
+  /* create lhs and rhs
+     NOTE: The descriptions always say vecX and vecY are column-major cusparse
+     order. For CUSPARSE_VERSION 10301 this is the only supported ordering. if X
+     is not LayoutLeft, we can fix with a transpose. If cusparseSpMM ever
+     supports row-major dense matrices, this logic will have to be reworked */
+  constexpr bool xIsLL =
+      std::is_same<typename XVector::array_layout, Kokkos::LayoutLeft>::value;
+  constexpr bool xIsLR =
+      std::is_same<typename XVector::array_layout, Kokkos::LayoutRight>::value;
+  static_assert(xIsLL || xIsLR, "X multivector was not LL or LR (TPL error)");
+  cusparseDnMatDescr_t vecX = make_cusparse_dn_mat_descr_t(x);
+  cusparseDnMatDescr_t vecY = make_cusparse_dn_mat_descr_t(y);
+  cusparseOperation_t opB =
+      xIsLL ? CUSPARSE_OPERATION_NON_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
+
+  const cusparseSpMMAlg_t alg = CUSPARSE_MM_ALG_DEFAULT;
+
+  // the precision of the SpMV
+  const cudaDataType computeType =
+      compute_type<value_type, x_value_type, y_value_type>();
+
+  size_t bufferSize = 0;
+  KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpMM_bufferSize(
+      cusparseHandle, opA, opB, &alpha, A_cusparse, vecX, &beta, vecY,
+      computeType, alg, &bufferSize));
+
+  void *dBuffer = nullptr;
+  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMalloc(&dBuffer, bufferSize));
+  KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpMM(cusparseHandle, opA, opB, &alpha,
+                                         A_cusparse, vecX, &beta, vecY,
+                                         computeType, alg, dBuffer));
+
+  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(dBuffer));
+  KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroyDnMat(vecX));
+  KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroyDnMat(vecY));
+  KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroySpMat(A_cusparse));
+}
+
+#define KOKKOSSPARSE_SPMV_MV_CUSPARSE(SCALAR, ORDINAL, OFFSET, XL, YL, SPACE,  \
+                                      COMPILE_LIBRARY)                         \
+  template <>                                                                  \
+  struct SPMV_MV<                                                              \
+      SCALAR const, ORDINAL const, Kokkos::Device<Kokkos::Cuda, SPACE>,        \
+      Kokkos::MemoryTraits<Kokkos::Unmanaged>, OFFSET const, SCALAR const **,  \
+      XL, Kokkos::Device<Kokkos::Cuda, SPACE>,                                 \
+      Kokkos::MemoryTraits<Kokkos::Unmanaged | Kokkos::RandomAccess>,          \
+      SCALAR **, YL, Kokkos::Device<Kokkos::Cuda, SPACE>,                      \
+      Kokkos::MemoryTraits<Kokkos::Unmanaged>, false, true, COMPILE_LIBRARY> { \
+    using device_type       = Kokkos::Device<Kokkos::Cuda, SPACE>;             \
+    using memory_trait_type = Kokkos::MemoryTraits<Kokkos::Unmanaged>;         \
+    using AMatrix = CrsMatrix<SCALAR const, ORDINAL const, device_type,        \
+                              memory_trait_type, OFFSET const>;                \
+    using XVector = Kokkos::View<                                              \
+        SCALAR const **, XL, device_type,                                      \
+        Kokkos::MemoryTraits<Kokkos::Unmanaged | Kokkos::RandomAccess>>;       \
+    using YVector =                                                            \
+        Kokkos::View<SCALAR **, YL, device_type, memory_trait_type>;           \
+                                                                               \
+    using coefficient_type = typename YVector::non_const_value_type;           \
+                                                                               \
+    using Controls = KokkosKernels::Experimental::Controls;                    \
+    static void spmv_mv(const Controls &controls, const char mode[],           \
+                        const coefficient_type &alpha, const AMatrix &A,       \
+                        const XVector &x, const coefficient_type &beta,        \
+                        const YVector &y) {                                    \
+      std::string label = "KokkosSparse::spmv[TPL_CUSPARSE," +                 \
+                          Kokkos::ArithTraits<SCALAR>::name() + "]";           \
+      Kokkos::Profiling::pushRegion(label);                                    \
+      spmv_mv_cusparse(controls, mode, alpha, A, x, beta, y);                  \
+      Kokkos::Profiling::popRegion();                                          \
+    }                                                                          \
+  };
+
+/* cusparseSpMM with following restrictions
+ column-major ordering for Y
+ col-major or row-major for X (see note below)
+ 32-bit indices for matrix A */
+KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::LayoutLeft,
+                              Kokkos::LayoutLeft, Kokkos::CudaSpace,
+                              KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
+KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::LayoutRight,
+                              Kokkos::LayoutLeft, Kokkos::CudaSpace,
+                              KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
+
+KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::LayoutLeft,
+                              Kokkos::LayoutLeft, Kokkos::CudaSpace,
+                              KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
+KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::LayoutRight,
+                              Kokkos::LayoutLeft, Kokkos::CudaSpace,
+                              KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
+
+KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex<double>, int, int,
+                              Kokkos::LayoutLeft, Kokkos::LayoutLeft,
+                              Kokkos::CudaSpace,
+                              KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
+KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex<double>, int, int,
+                              Kokkos::LayoutRight, Kokkos::LayoutLeft,
+                              Kokkos::CudaSpace,
+                              KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
+
+KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex<float>, int, int,
+                              Kokkos::LayoutLeft, Kokkos::LayoutLeft,
+                              Kokkos::CudaSpace,
+                              KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
+KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex<float>, int, int,
+                              Kokkos::LayoutRight, Kokkos::LayoutLeft,
+                              Kokkos::CudaSpace,
+                              KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
+
+KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::LayoutLeft,
+                              Kokkos::LayoutLeft, Kokkos::CudaUVMSpace,
+                              KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
+KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::LayoutRight,
+                              Kokkos::LayoutLeft, Kokkos::CudaUVMSpace,
+                              KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
+
+KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::LayoutLeft,
+                              Kokkos::LayoutLeft, Kokkos::CudaUVMSpace,
+                              KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
+KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::LayoutRight,
+                              Kokkos::LayoutLeft, Kokkos::CudaUVMSpace,
+                              KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
+
+KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex<double>, int, int,
+                              Kokkos::LayoutLeft, Kokkos::LayoutLeft,
+                              Kokkos::CudaUVMSpace,
+                              KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
+KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex<double>, int, int,
+                              Kokkos::LayoutRight, Kokkos::LayoutLeft,
+                              Kokkos::CudaUVMSpace,
+                              KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
+
+KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex<float>, int, int,
+                              Kokkos::LayoutLeft, Kokkos::LayoutLeft,
+                              Kokkos::CudaUVMSpace,
+                              KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
+KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex<float>, int, int,
+                              Kokkos::LayoutRight, Kokkos::LayoutLeft,
+                              Kokkos::CudaUVMSpace,
+                              KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
+
+#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT
+KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::Experimental::half_t, int, int,
+                              Kokkos::LayoutLeft, Kokkos::LayoutLeft,
+                              Kokkos::CudaSpace,
+                              KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
+KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::Experimental::half_t, int, int,
+                              Kokkos::LayoutRight, Kokkos::LayoutLeft,
+                              Kokkos::CudaSpace,
+                              KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
+
+KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::Experimental::half_t, int, int,
+                              Kokkos::LayoutLeft, Kokkos::LayoutLeft,
+                              Kokkos::CudaUVMSpace,
+                              KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
+KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::Experimental::half_t, int, int,
+                              Kokkos::LayoutRight, Kokkos::LayoutLeft,
+                              Kokkos::CudaUVMSpace,
+                              KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
+
+#endif
+
+#undef KOKKOSSPARSE_SPMV_MV_CUSPARSE
+
+}  // namespace Impl
+}  // namespace KokkosSparse
+#endif  // defined(CUSPARSE_VERSION) && (10301 <= CUSPARSE_VERSION)
+#endif  // KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
+
+#endif  // KOKKOSPARSE_SPMV_MV_TPL_SPEC_DECL_HPP_
\ No newline at end of file
diff --git a/src/impl/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp b/src/impl/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp
index fd42797d71..a91996361b 100644
--- a/src/impl/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp
+++ b/src/impl/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp
@@ -201,6 +201,8 @@ KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex<double>, int64_t,
 #endif  // CUDA/CUSPARSE >= 9.0?
 #endif  // KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
 
+#undef KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE
+
 #if defined(KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE)
 
 #define KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ROCSPARSE(SCALAR, LAYOUT)             \
@@ -265,15 +267,6 @@ KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(Kokkos::complex<double>, Kokkos::OpenMP)
 
 #endif  // KOKKOSKERNELS_ENABLE_TPL_MKL
 
-// Specialization struct which defines whether a specialization exists
-template <class AT, class AO, class AD, class AM, class AS, class XT, class XL,
-          class XD, class XM, class YT, class YL, class YD, class YM,
-          const bool integerScalarType =
-              std::is_integral<typename std::decay<AT>::type>::value>
-struct spmv_mv_tpl_spec_avail {
-  enum : bool { value = false };
-};
-
 }  // namespace Impl
 }  // namespace KokkosSparse
 
diff --git a/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp
index 17a72b2ad3..b4c73a12ff 100644
--- a/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp
+++ b/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp
@@ -50,7 +50,7 @@
 // cuSPARSE
 #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
 #include "cusparse.h"
-#include "KokkosKernels_SparseUtils_cusparse.hpp"
+#include "KokkosSparse_Utils_cusparse.hpp"
 
 namespace KokkosSparse {
 namespace Impl {
@@ -86,25 +86,11 @@ void spmv_cusparse(const KokkosKernels::Experimental::Controls& controls,
 #if defined(CUSPARSE_VERSION) && (10300 <= CUSPARSE_VERSION)
 
   /* Check that cusparse can handle the types of the input Kokkos::CrsMatrix */
-  cusparseIndexType_t myCusparseOffsetType;
-  if (std::is_same<offset_type, int>::value)
-    myCusparseOffsetType = CUSPARSE_INDEX_32I;
-  else if (std::is_same<offset_type, int64_t>::value ||
-           std::is_same<offset_type, size_t>::value)
-    myCusparseOffsetType = CUSPARSE_INDEX_64I;
-  else
-    throw std::logic_error(
-        "Offset type of CrsMatrix isn't supported by cuSPARSE, yet TPL layer "
-        "says it is");
-  cusparseIndexType_t myCusparseEntryType;
-  if (std::is_same<entry_type, int>::value)
-    myCusparseEntryType = CUSPARSE_INDEX_32I;
-  else if (std::is_same<entry_type, int64_t>::value)
-    myCusparseEntryType = CUSPARSE_INDEX_64I;
-  else
-    throw std::logic_error(
-        "Ordinal (entry) type of CrsMatrix isn't supported by cuSPARSE, yet "
-        "TPL layer says it is");
+  const cusparseIndexType_t myCusparseOffsetType =
+      cusparse_index_type_t_from<offset_type>();
+  const cusparseIndexType_t myCusparseEntryType =
+      cusparse_index_type_t_from<entry_type>();
+
   cudaDataType myCudaDataType;
   if (std::is_same<value_type, float>::value)
     myCudaDataType = CUDA_R_32F;
@@ -134,15 +120,27 @@ void spmv_cusparse(const KokkosKernels::Experimental::Controls& controls,
   KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateDnVec(
       &vecY, y.extent_int(0), (void*)y.data(), myCudaDataType));
 
-  size_t bufferSize     = 0;
-  void* dBuffer         = NULL;
+  size_t bufferSize = 0;
+  void* dBuffer     = NULL;
+#if CUSPARSE_VERSION >= 11201
+  cusparseSpMVAlg_t alg = CUSPARSE_SPMV_ALG_DEFAULT;
+#else
   cusparseSpMVAlg_t alg = CUSPARSE_MV_ALG_DEFAULT;
+#endif
   if (controls.isParameter("algorithm")) {
     const std::string algName = controls.getParameter("algorithm");
     if (algName == "default")
+#if CUSPARSE_VERSION >= 11201
+      alg = CUSPARSE_SPMV_ALG_DEFAULT;
+#else
       alg = CUSPARSE_MV_ALG_DEFAULT;
+#endif
     else if (algName == "merge")
+#if CUSPARSE_VERSION >= 11201
+      alg = CUSPARSE_SPMV_CSR_ALG2;
+#else
       alg = CUSPARSE_CSRMV_ALG2;
+#endif
   }
   KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpMV_bufferSize(
       cusparseHandle, myCusparseOperation, &alpha, A_cusparse, vecX, &beta,
@@ -361,8 +359,8 @@ KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex<float>, int64_t, size_t,
 KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex<float>, int64_t, size_t,
                            Kokkos::LayoutRight, Kokkos::CudaUVMSpace,
                            KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
-#endif
-#endif
+#endif  // defined(CUSPARSE_VERSION) && (10300 <= CUSPARSE_VERSION)
+#endif  // 9000 <= CUDA_VERSION
 
 #undef KOKKOSSPARSE_SPMV_CUSPARSE
 
@@ -373,7 +371,7 @@ KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex<float>, int64_t, size_t,
 // rocSPARSE
 #if defined(KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE)
 #include <rocsparse.h>
-#include "KokkosKernels_SparseUtils_rocsparse.hpp"
+#include "KokkosSparse_Utils_rocsparse.hpp"
 
 namespace KokkosSparse {
 namespace Impl {
@@ -530,6 +528,7 @@ KOKKOSSPARSE_SPMV_ROCSPARSE(Kokkos::complex<float>, Kokkos::LayoutRight,
 
 #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL
 #include <mkl.h>
+#include "KokkosSparse_Utils_mkl.hpp"
 
 namespace KokkosSparse {
 namespace Impl {
@@ -537,27 +536,6 @@ namespace Impl {
 #if (__INTEL_MKL__ > 2017)
 // MKL 2018 and above: use new interface: sparse_matrix_t and mkl_sparse_?_mv()
 
-// Note 12/03/21 - lbv:
-// mkl_safe_call and mode_kk_to_mkl should
-// be moved to some sparse or mkl utility
-// header. It is likely that these will be
-// reused for other kernels.
-inline void mkl_safe_call(int errcode) {
-  if (errcode != SPARSE_STATUS_SUCCESS)
-    throw std::runtime_error("MKL returned non-success error code");
-}
-
-inline sparse_operation_t mode_kk_to_mkl(char mode_kk) {
-  switch (toupper(mode_kk)) {
-    case 'N': return SPARSE_OPERATION_NON_TRANSPOSE;
-    case 'T': return SPARSE_OPERATION_TRANSPOSE;
-    case 'H': return SPARSE_OPERATION_CONJUGATE_TRANSPOSE;
-    default:;
-  }
-  throw std::invalid_argument(
-      "Invalid mode for MKL (should be one of N, T, H)");
-}
-
 inline void spmv_mkl(sparse_operation_t op, float alpha, float beta, int m,
                      int n, const int* Arowptrs, const int* Aentries,
                      const float* Avalues, const float* x, float* y) {
@@ -566,11 +544,12 @@ inline void spmv_mkl(sparse_operation_t op, float alpha, float beta, int m,
   A_descr.type = SPARSE_MATRIX_TYPE_GENERAL;
   A_descr.mode = SPARSE_FILL_MODE_FULL;
   A_descr.diag = SPARSE_DIAG_NON_UNIT;
-  mkl_safe_call(mkl_sparse_s_create_csr(
+  KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_s_create_csr(
       &A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, const_cast<int*>(Arowptrs),
       const_cast<int*>(Arowptrs + 1), const_cast<int*>(Aentries),
       const_cast<float*>(Avalues)));
-  mkl_safe_call(mkl_sparse_s_mv(op, alpha, A_mkl, A_descr, x, beta, y));
+  KOKKOSKERNELS_MKL_SAFE_CALL(
+      mkl_sparse_s_mv(op, alpha, A_mkl, A_descr, x, beta, y));
 }
 
 inline void spmv_mkl(sparse_operation_t op, double alpha, double beta, int m,
@@ -581,11 +560,12 @@ inline void spmv_mkl(sparse_operation_t op, double alpha, double beta, int m,
   A_descr.type = SPARSE_MATRIX_TYPE_GENERAL;
   A_descr.mode = SPARSE_FILL_MODE_FULL;
   A_descr.diag = SPARSE_DIAG_NON_UNIT;
-  mkl_safe_call(mkl_sparse_d_create_csr(
+  KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_csr(
       &A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, const_cast<int*>(Arowptrs),
       const_cast<int*>(Arowptrs + 1), const_cast<int*>(Aentries),
       const_cast<double*>(Avalues)));
-  mkl_safe_call(mkl_sparse_d_mv(op, alpha, A_mkl, A_descr, x, beta, y));
+  KOKKOSKERNELS_MKL_SAFE_CALL(
+      mkl_sparse_d_mv(op, alpha, A_mkl, A_descr, x, beta, y));
 }
 
 inline void spmv_mkl(sparse_operation_t op, Kokkos::complex<float> alpha,
@@ -599,15 +579,15 @@ inline void spmv_mkl(sparse_operation_t op, Kokkos::complex<float> alpha,
   A_descr.type = SPARSE_MATRIX_TYPE_GENERAL;
   A_descr.mode = SPARSE_FILL_MODE_FULL;
   A_descr.diag = SPARSE_DIAG_NON_UNIT;
-  mkl_safe_call(mkl_sparse_c_create_csr(
+  KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_create_csr(
       &A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, const_cast<int*>(Arowptrs),
       const_cast<int*>(Arowptrs + 1), const_cast<int*>(Aentries),
       (MKL_Complex8*)Avalues));
-  MKL_Complex8& alpha_mkl = reinterpret_cast<MKL_Complex8&>(alpha);
-  MKL_Complex8& beta_mkl  = reinterpret_cast<MKL_Complex8&>(beta);
-  mkl_safe_call(mkl_sparse_c_mv(op, alpha_mkl, A_mkl, A_descr,
-                                reinterpret_cast<const MKL_Complex8*>(x),
-                                beta_mkl, reinterpret_cast<MKL_Complex8*>(y)));
+  MKL_Complex8 alpha_mkl{alpha.real(), alpha.imag()};
+  MKL_Complex8 beta_mkl{beta.real(), beta.imag()};
+  KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_mv(
+      op, alpha_mkl, A_mkl, A_descr, reinterpret_cast<const MKL_Complex8*>(x),
+      beta_mkl, reinterpret_cast<MKL_Complex8*>(y)));
 }
 
 inline void spmv_mkl(sparse_operation_t op, Kokkos::complex<double> alpha,
@@ -621,15 +601,15 @@ inline void spmv_mkl(sparse_operation_t op, Kokkos::complex<double> alpha,
   A_descr.type = SPARSE_MATRIX_TYPE_GENERAL;
   A_descr.mode = SPARSE_FILL_MODE_FULL;
   A_descr.diag = SPARSE_DIAG_NON_UNIT;
-  mkl_safe_call(mkl_sparse_z_create_csr(
+  KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_create_csr(
       &A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, const_cast<int*>(Arowptrs),
       const_cast<int*>(Arowptrs + 1), const_cast<int*>(Aentries),
       (MKL_Complex16*)Avalues));
-  MKL_Complex16& alpha_mkl = reinterpret_cast<MKL_Complex16&>(alpha);
-  MKL_Complex16& beta_mkl  = reinterpret_cast<MKL_Complex16&>(beta);
-  mkl_safe_call(mkl_sparse_z_mv(op, alpha_mkl, A_mkl, A_descr,
-                                reinterpret_cast<const MKL_Complex16*>(x),
-                                beta_mkl, reinterpret_cast<MKL_Complex16*>(y)));
+  MKL_Complex16 alpha_mkl{alpha.real(), alpha.imag()};
+  MKL_Complex16 beta_mkl{beta.real(), beta.imag()};
+  KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_mv(
+      op, alpha_mkl, A_mkl, A_descr, reinterpret_cast<const MKL_Complex16*>(x),
+      beta_mkl, reinterpret_cast<MKL_Complex16*>(y)));
 }
 
 #define KOKKOSSPARSE_SPMV_MKL(SCALAR, EXECSPACE, COMPILE_LIBRARY)              \
diff --git a/src/common/KokkosKernels_Controls.hpp b/src/sparse/KokkosKernels_Controls.hpp
similarity index 88%
rename from src/common/KokkosKernels_Controls.hpp
rename to src/sparse/KokkosKernels_Controls.hpp
index c5a47a24b3..aabe0069be 100644
--- a/src/common/KokkosKernels_Controls.hpp
+++ b/src/sparse/KokkosKernels_Controls.hpp
@@ -81,28 +81,23 @@ class Controls {
 
   // check if a parameter is already set
   bool isParameter(const std::string& name) const {
-    bool return_value = false;
-
-    auto search = kernel_parameters.find(name);
-    if (search != kernel_parameters.end()) {
-      return_value = true;
-    }
-
-    return return_value;
+    return kernel_parameters.end() != kernel_parameters.find(name);
   }
 
-  // retrieve the value associated with a parameter if it is already set
-  std::string getParameter(const std::string& name) const {
+  /// \brief get the value associated with \c name, or \c default if not present
+  ///
+  /// \param name the name of the parameter to retrieve
+  /// \param orUnset (default \c "" ) the value to return if \c name is not set
+  std::string getParameter(const std::string& name,
+                           const std::string& orUnset = "") const {
     auto search = kernel_parameters.find(name);
-    std::string value;
-    if (search == kernel_parameters.end()) {
-      std::cout << "Parameter " << name
-                << " was not found in the list of parameters!" << std::endl;
-      value = "";
+    if (kernel_parameters.end() == search) {
+      std::cerr << "WARNING: Controls::getParameter for name \"" << name
+                << "\" was unset" << std::endl;
+      return orUnset;
     } else {
-      value = search->second;
+      return search->second;
     }
-    return value;
   }
 
 #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS
diff --git a/src/common/KokkosKernels_Handle.hpp b/src/sparse/KokkosKernels_Handle.hpp
similarity index 99%
rename from src/common/KokkosKernels_Handle.hpp
rename to src/sparse/KokkosKernels_Handle.hpp
index 0e9ba8dc4e..69a74c3e5d 100644
--- a/src/common/KokkosKernels_Handle.hpp
+++ b/src/sparse/KokkosKernels_Handle.hpp
@@ -181,6 +181,7 @@ class KokkosKernelsHandle {
     this->gs_sptrsvUHandle = right_side_handle.get_gs_sptrsvU_handle();
 
     this->spgemmHandle = right_side_handle.get_spgemm_handle();
+    this->spaddHandle  = right_side_handle.get_spadd_handle();
 
     this->sptrsvHandle = right_side_handle.get_sptrsv_handle();
     this->spilukHandle = right_side_handle.get_spiluk_handle();
diff --git a/src/sparse/KokkosSparse_IOUtils.hpp b/src/sparse/KokkosSparse_IOUtils.hpp
new file mode 100644
index 0000000000..fa6d08f960
--- /dev/null
+++ b/src/sparse/KokkosSparse_IOUtils.hpp
@@ -0,0 +1,1274 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+#ifndef _KOKKOSSPARSE_IOUTILS_HPP
+#define _KOKKOSSPARSE_IOUTILS_HPP
+
+#include "KokkosKernels_IOUtils.hpp"
+#include "KokkosSparse_CrsMatrix.hpp"
+
+namespace KokkosSparse {
+namespace Impl {
+
+// MD: Bases on Christian's sparseMatrix_generate function in test_crsmatrix.cpp
+// file.
+template <typename ScalarType, typename OrdinalType, typename SizeType>
+void kk_sparseMatrix_generate(OrdinalType nrows, OrdinalType ncols,
+                              SizeType &nnz, OrdinalType row_size_variance,
+                              OrdinalType bandwidth, ScalarType *&values,
+                              SizeType *&rowPtr, OrdinalType *&colInd,
+                              OrdinalType block_elem_count = 1) {
+  rowPtr = new SizeType[nrows + 1];
+
+  OrdinalType elements_per_row = nrows ? nnz / nrows : 0;
+  srand(13721);
+  rowPtr[0] = 0;
+  for (int row = 0; row < nrows; row++) {
+    int varianz       = (1.0 * rand() / RAND_MAX - 0.5) * row_size_variance;
+    int numRowEntries = elements_per_row + varianz;
+    if (numRowEntries < 0) numRowEntries = 0;
+    // Clamping numRowEntries above accomplishes 2 things:
+    //  - If ncols is 0, numRowEntries will also be 0
+    //  - With numRowEntries at most 2/3 the number of columns, in the worst
+    //  case
+    //    90% of insertions will succeed after 6 tries
+    if (numRowEntries > 0.66 * ncols) numRowEntries = 0.66 * ncols;
+    rowPtr[row + 1] = rowPtr[row] + numRowEntries;
+  }
+  nnz    = rowPtr[nrows];
+  values = new ScalarType[nnz];
+  colInd = new OrdinalType[nnz];
+  for (OrdinalType row = 0; row < nrows; row++) {
+    for (SizeType k = rowPtr[row]; k < rowPtr[row + 1]; ++k) {
+      while (true) {
+        OrdinalType pos = (1.0 * rand() / RAND_MAX - 0.5) * bandwidth + row;
+        while (pos < 0) pos += ncols;
+        while (pos >= ncols) pos -= ncols;
+
+        bool is_already_in_the_row = false;
+        for (SizeType j = rowPtr[row]; j < k; j++) {
+          if (colInd[j] == pos) {
+            is_already_in_the_row = true;
+            break;
+          }
+        }
+        if (!is_already_in_the_row) {
+          colInd[k] = pos;
+          break;
+        }
+      }
+    }
+  }
+  // Sample each value from uniform (-50, 50) for real types, or (-50 - 50i, 50
+  // + 50i) for complex types.
+  Kokkos::View<ScalarType *, Kokkos::HostSpace> valuesView(
+      values, nnz * block_elem_count);
+  ScalarType randStart, randEnd;
+  KokkosKernels::Impl::getRandomBounds(50.0, randStart, randEnd);
+  Kokkos::Random_XorShift64_Pool<Kokkos::DefaultHostExecutionSpace> pool(13718);
+  Kokkos::fill_random(valuesView, pool, randStart, randEnd);
+}
+
+template <typename ScalarType, typename OrdinalType, typename SizeType>
+void kk_sparseMatrix_generate_lower_upper_triangle(
+    char uplo, OrdinalType nrows, OrdinalType ncols, SizeType &nnz,
+    OrdinalType /*row_size_variance*/, OrdinalType /*bandwidth*/,
+    ScalarType *&values, SizeType *&rowPtr, OrdinalType *&colInd) {
+  rowPtr = new SizeType[nrows + 1];
+
+  // OrdinalType elements_per_row = nnz/nrows;
+  srand(13721);
+  rowPtr[0] = 0;
+  for (int row = 0; row < nrows; row++) {
+    if (uplo == 'L')
+      rowPtr[row + 1] = rowPtr[row] + row + 1;
+    else
+      rowPtr[row + 1] = rowPtr[row] + ncols - (row);
+  }
+  nnz    = rowPtr[nrows];
+  values = new ScalarType[nnz];
+  colInd = new OrdinalType[nnz];
+  for (OrdinalType row = 0; row < nrows; row++) {
+    for (SizeType k = rowPtr[row]; k < rowPtr[row + 1]; k++) {
+      if (uplo == 'L')
+        colInd[k] = k - rowPtr[row];
+      else
+        colInd[k] = row + (k - rowPtr[row]);
+      values[k] = 1.0;
+    }
+  }
+}
+
+template <typename ScalarType, typename OrdinalType, typename SizeType>
+void kk_diagonally_dominant_sparseMatrix_generate(
+    OrdinalType nrows, OrdinalType ncols, SizeType &nnz,
+    OrdinalType row_size_variance, OrdinalType bandwidth, ScalarType *&values,
+    SizeType *&rowPtr, OrdinalType *&colInd,
+    ScalarType diagDominance = 10 * Kokkos::ArithTraits<ScalarType>::one()) {
+  rowPtr = new SizeType[nrows + 1];
+
+  OrdinalType elements_per_row = nnz / nrows;
+  srand(13721);
+  rowPtr[0] = 0;
+  for (int row = 0; row < nrows; row++) {
+    int varianz = (1.0 * rand() / RAND_MAX - 0.5) * row_size_variance;
+    if (varianz < 1) varianz = 1;
+    if (varianz > 0.75 * ncols) varianz = 0.75 * ncols;
+    rowPtr[row + 1] = rowPtr[row] + elements_per_row + varianz;
+    if (rowPtr[row + 1] <= rowPtr[row])   // This makes sure that there is
+      rowPtr[row + 1] = rowPtr[row] + 1;  // at least one nonzero in the row
+  }
+  nnz    = rowPtr[nrows];
+  values = new ScalarType[nnz];
+  colInd = new OrdinalType[nnz];
+  for (OrdinalType row = 0; row < nrows; row++) {
+    ScalarType total_values = 0;
+    std::unordered_set<OrdinalType> entriesInRow;
+    // We always add the diagonal entry (after this loop)
+    entriesInRow.insert(row);
+    for (SizeType k = rowPtr[row]; k < rowPtr[row + 1] - 1; k++) {
+      while (true) {
+        OrdinalType pos = (1.0 * rand() / RAND_MAX - 0.5) * bandwidth + row;
+        while (pos < 0) pos += ncols;
+        while (pos >= ncols) pos -= ncols;
+
+        if (entriesInRow.find(pos) == entriesInRow.end()) {
+          entriesInRow.insert(pos);
+          colInd[k] = pos;
+          values[k] = 100.0 * rand() / RAND_MAX - 50.0;
+          total_values +=
+              Kokkos::Details::ArithTraits<ScalarType>::abs(values[k]);
+          break;
+        }
+      }
+    }
+
+    colInd[rowPtr[row + 1] - 1] = row;
+    values[rowPtr[row + 1] - 1] = total_values * diagDominance;
+  }
+}
+
+// This function creates a diagonal sparse matrix for testing matrix operations.
+// The elements on the diagonal are 1, 2, ..., n-1, n.
+// If "invert" is true, it will return the inverse of the above diagonal matrix.
+template <typename crsMat_t>
+crsMat_t kk_generate_diag_matrix(typename crsMat_t::const_ordinal_type n,
+                                 const bool invert = false) {
+  typedef typename crsMat_t::ordinal_type ot;
+  typedef typename crsMat_t::StaticCrsGraphType graph_t;
+  typedef typename graph_t::row_map_type::non_const_type row_map_view_t;
+  typedef typename graph_t::entries_type::non_const_type cols_view_t;
+  typedef typename crsMat_t::values_type::non_const_type values_view_t;
+
+  typedef typename row_map_view_t::non_const_value_type size_type;
+  typedef typename cols_view_t::non_const_value_type lno_t;
+  typedef typename values_view_t::non_const_value_type scalar_t;
+
+  row_map_view_t rowmap_view("rowmap_view", n + 1);
+  cols_view_t columns_view("colsmap_view", n);
+  values_view_t values_view("values_view", n);
+
+  {
+    typename row_map_view_t::HostMirror hr =
+        Kokkos::create_mirror_view(rowmap_view);
+    typename cols_view_t::HostMirror hc =
+        Kokkos::create_mirror_view(columns_view);
+    typename values_view_t::HostMirror hv =
+        Kokkos::create_mirror_view(values_view);
+
+    for (lno_t i = 0; i <= n; ++i) {
+      hr(i) = size_type(i);
+    }
+
+    for (ot i = 0; i < n; ++i) {
+      hc(i) = lno_t(i);
+      if (invert) {
+        hv(i) = scalar_t(1.0) / (scalar_t(i + 1));
+      } else {
+        hv(i) = scalar_t(i + 1);
+      }
+    }
+    Kokkos::deep_copy(rowmap_view, hr);
+    Kokkos::deep_copy(columns_view, hc);
+    Kokkos::deep_copy(values_view, hv);
+  }
+
+  graph_t static_graph(columns_view, rowmap_view);
+  crsMat_t crsmat("CrsMatrix", n, values_view, static_graph);
+  return crsmat;
+}
+
+template <typename crsMat_t>
+crsMat_t kk_generate_diagonally_dominant_sparse_matrix(
+    typename crsMat_t::const_ordinal_type nrows,
+    typename crsMat_t::const_ordinal_type ncols,
+    typename crsMat_t::non_const_size_type &nnz,
+    typename crsMat_t::const_ordinal_type row_size_variance,
+    typename crsMat_t::const_ordinal_type bandwidth,
+    typename crsMat_t::const_value_type diagDominance =
+        10 * Kokkos::ArithTraits<typename crsMat_t::value_type>::one()) {
+  typedef typename crsMat_t::StaticCrsGraphType graph_t;
+  typedef typename graph_t::row_map_type::non_const_type row_map_view_t;
+  typedef typename graph_t::entries_type::non_const_type cols_view_t;
+  typedef typename crsMat_t::values_type::non_const_type values_view_t;
+
+  typedef typename row_map_view_t::non_const_value_type size_type;
+  typedef typename cols_view_t::non_const_value_type lno_t;
+  typedef typename values_view_t::non_const_value_type scalar_t;
+  lno_t *adj;
+  size_type *xadj;  //, nnzA;
+  scalar_t *values;
+
+  kk_diagonally_dominant_sparseMatrix_generate<scalar_t, lno_t, size_type>(
+      nrows, ncols, nnz, row_size_variance, bandwidth, values, xadj, adj,
+      diagDominance);
+
+  row_map_view_t rowmap_view("rowmap_view", nrows + 1);
+  cols_view_t columns_view("colsmap_view", nnz);
+  values_view_t values_view("values_view", nnz);
+
+  {
+    typename row_map_view_t::HostMirror hr =
+        Kokkos::create_mirror_view(rowmap_view);
+    typename cols_view_t::HostMirror hc =
+        Kokkos::create_mirror_view(columns_view);
+    typename values_view_t::HostMirror hv =
+        Kokkos::create_mirror_view(values_view);
+
+    for (lno_t i = 0; i <= nrows; ++i) {
+      hr(i) = xadj[i];
+    }
+
+    for (size_type i = 0; i < nnz; ++i) {
+      hc(i) = adj[i];
+      hv(i) = values[i];
+    }
+    Kokkos::deep_copy(rowmap_view, hr);
+    Kokkos::deep_copy(columns_view, hc);
+    Kokkos::deep_copy(values_view, hv);
+  }
+
+  graph_t static_graph(columns_view, rowmap_view);
+  crsMat_t crsmat("CrsMatrix", ncols, values_view, static_graph);
+  delete[] xadj;
+  delete[] adj;
+  delete[] values;
+  return crsmat;
+}
+
+template <typename crsMat_t>
+crsMat_t kk_generate_triangular_sparse_matrix(
+    char uplo, typename crsMat_t::const_ordinal_type nrows,
+    typename crsMat_t::const_ordinal_type ncols,
+    typename crsMat_t::non_const_size_type &nnz,
+    typename crsMat_t::const_ordinal_type row_size_variance,
+    typename crsMat_t::const_ordinal_type bandwidth) {
+  typedef typename crsMat_t::StaticCrsGraphType graph_t;
+  typedef typename graph_t::row_map_type::non_const_type row_map_view_t;
+  typedef typename graph_t::entries_type::non_const_type cols_view_t;
+  typedef typename crsMat_t::values_type::non_const_type values_view_t;
+
+  typedef typename row_map_view_t::non_const_value_type size_type;
+  typedef typename cols_view_t::non_const_value_type lno_t;
+  typedef typename values_view_t::non_const_value_type scalar_t;
+  lno_t *adj;
+  size_type *xadj;  //, nnzA;
+  scalar_t *values;
+
+  kk_sparseMatrix_generate_lower_upper_triangle<scalar_t, lno_t, size_type>(
+      uplo, nrows, ncols, nnz, row_size_variance, bandwidth, values, xadj, adj);
+
+  row_map_view_t rowmap_view("rowmap_view", nrows + 1);
+  cols_view_t columns_view("colsmap_view", nnz);
+  values_view_t values_view("values_view", nnz);
+
+  {
+    typename row_map_view_t::HostMirror hr =
+        Kokkos::create_mirror_view(rowmap_view);
+    typename cols_view_t::HostMirror hc =
+        Kokkos::create_mirror_view(columns_view);
+    typename values_view_t::HostMirror hv =
+        Kokkos::create_mirror_view(values_view);
+
+    for (lno_t i = 0; i <= nrows; ++i) {
+      hr(i) = xadj[i];
+    }
+
+    for (size_type i = 0; i < nnz; ++i) {
+      hc(i) = adj[i];
+      hv(i) = values[i];
+    }
+    Kokkos::deep_copy(rowmap_view, hr);
+    Kokkos::deep_copy(columns_view, hc);
+    Kokkos::deep_copy(values_view, hv);
+    Kokkos::fence();
+  }
+
+  graph_t static_graph(columns_view, rowmap_view);
+  crsMat_t crsmat("CrsMatrix", ncols, values_view, static_graph);
+  delete[] xadj;
+  delete[] adj;
+  delete[] values;
+  return crsmat;
+}
+
+template <typename crsMat_t>
+crsMat_t kk_generate_sparse_matrix(
+    typename crsMat_t::const_ordinal_type nrows,
+    typename crsMat_t::const_ordinal_type ncols,
+    typename crsMat_t::non_const_size_type &nnz,
+    typename crsMat_t::const_ordinal_type row_size_variance,
+    typename crsMat_t::const_ordinal_type bandwidth) {
+  typedef typename crsMat_t::StaticCrsGraphType graph_t;
+  typedef typename graph_t::row_map_type::non_const_type row_map_view_t;
+  typedef typename graph_t::entries_type::non_const_type cols_view_t;
+  typedef typename crsMat_t::values_type::non_const_type values_view_t;
+
+  typedef typename row_map_view_t::non_const_value_type size_type;
+  typedef typename cols_view_t::non_const_value_type lno_t;
+  typedef typename values_view_t::non_const_value_type scalar_t;
+  lno_t *adj;
+  size_type *xadj;  //, nnzA;
+  scalar_t *values;
+
+  kk_sparseMatrix_generate<scalar_t, lno_t, size_type>(
+      nrows, ncols, nnz, row_size_variance, bandwidth, values, xadj, adj);
+
+  row_map_view_t rowmap_view("rowmap_view", nrows + 1);
+  cols_view_t columns_view("colsmap_view", nnz);
+  values_view_t values_view("values_view", nnz);
+
+  {
+    typename row_map_view_t::HostMirror hr =
+        Kokkos::create_mirror_view(rowmap_view);
+    typename cols_view_t::HostMirror hc =
+        Kokkos::create_mirror_view(columns_view);
+    typename values_view_t::HostMirror hv =
+        Kokkos::create_mirror_view(values_view);
+
+    for (lno_t i = 0; i <= nrows; ++i) {
+      hr(i) = xadj[i];
+    }
+
+    for (size_type i = 0; i < nnz; ++i) {
+      hc(i) = adj[i];
+      hv(i) = values[i];
+    }
+    Kokkos::deep_copy(rowmap_view, hr);
+    Kokkos::deep_copy(columns_view, hc);
+    Kokkos::deep_copy(values_view, hv);
+  }
+
+  graph_t static_graph(columns_view, rowmap_view);
+  crsMat_t crsmat("CrsMatrix", ncols, values_view, static_graph);
+  delete[] xadj;
+  delete[] adj;
+  delete[] values;
+  return crsmat;
+}
+
+template <typename bsrMat_t>
+bsrMat_t kk_generate_sparse_matrix(
+    typename bsrMat_t::const_ordinal_type block_dim,
+    typename bsrMat_t::const_ordinal_type nrows,
+    typename bsrMat_t::const_ordinal_type ncols,
+    typename bsrMat_t::non_const_size_type &nnz,
+    typename bsrMat_t::const_ordinal_type row_size_variance,
+    typename bsrMat_t::const_ordinal_type bandwidth) {
+  typedef KokkosSparse::CrsMatrix<
+      typename bsrMat_t::value_type, typename bsrMat_t::ordinal_type,
+      typename bsrMat_t::device_type, typename bsrMat_t::memory_traits,
+      typename bsrMat_t::size_type>
+      crsMat_t;
+
+  const auto crs_mtx = kk_generate_sparse_matrix<crsMat_t>(
+      nrows * block_dim, ncols * block_dim, nnz, row_size_variance, bandwidth);
+  bsrMat_t bsrmat(crs_mtx, block_dim);
+  return bsrmat;
+}
+// TODO: need to fix the size_type. All over the reading inputs are lno_t.
+
+template <typename idx>
+void convert_crs_to_lower_triangle_edge_list(idx nv, idx *xadj, idx *adj,
+                                             idx *lower_triangle_srcs,
+                                             idx *lower_triangle_dests) {
+  idx ind = 0;
+  for (idx i = 0; i < nv; ++i) {
+    idx xb = xadj[i];
+    idx xe = xadj[i + 1];
+    for (idx j = xb; j < xe; ++j) {
+      idx dst = adj[j];
+      if (i < dst) {
+        lower_triangle_srcs[ind]    = i;
+        lower_triangle_dests[ind++] = dst;
+      }
+    }
+  }
+}
+
+template <typename idx>
+void convert_crs_to_edge_list(idx nv, idx *xadj, idx *srcs) {
+  for (idx i = 0; i < nv; ++i) {
+    idx xb = xadj[i];
+    idx xe = xadj[i + 1];
+    for (idx j = xb; j < xe; ++j) {
+      srcs[j] = i;
+    }
+  }
+}
+
+template <typename size_type, typename lno_t, typename wt>
+void convert_edge_list_to_csr(lno_t nv, size_type ne, lno_t *srcs, lno_t *dests,
+                              wt *ew, size_type *xadj, lno_t *adj, wt *crs_ew) {
+  std::vector<struct KokkosKernels::Impl::Edge<lno_t, wt>> edges(ne);
+  for (size_type i = 0; i < ne; ++i) {
+    edges[i].src = srcs[i];
+    edges[i].dst = dests[i];
+    edges[i].ew  = ew[i];
+  }
+  std::sort(edges.begin(), edges.begin() + ne);
+
+  size_type eind = 0;
+  for (lno_t i = 0; i < nv; ++i) {
+    (xadj)[i] = eind;
+    while (edges[eind].src == i) {
+      (adj)[eind]     = edges[eind].dst;
+      (*crs_ew)[eind] = edges[eind].ew;
+      ++eind;
+    }
+  }
+  xadj[nv] = eind;
+}
+
+template <typename in_lno_t, typename size_type, typename lno_t>
+void convert_undirected_edge_list_to_csr(lno_t nv, size_type ne, in_lno_t *srcs,
+                                         in_lno_t *dests, size_type *xadj,
+                                         lno_t *adj) {
+  std::vector<struct KokkosKernels::Impl::Edge<lno_t, double>> edges(ne * 2);
+  for (size_type i = 0; i < ne; ++i) {
+    edges[i * 2].src = srcs[i];
+    edges[i * 2].dst = dests[i];
+
+    edges[i * 2 + 1].src = dests[i];
+    edges[i * 2 + 1].dst = srcs[i];
+  }
+#ifdef KOKKOSKERNELS_HAVE_OUTER
+#include <parallel/multiseq_selection.h>
+#include <parallel/multiway_merge.h>
+#include <parallel/merge.h>
+#include <parallel/multiway_mergesort.h>
+  __gnu_parallel::parallel_sort_mwms<
+      false, true, struct KokkosKernels::Impl::Edge<lno_t, double> *>(
+      &(edges[0]), &(edges[0]) + ne * 2,
+      std::less<struct KokkosKernels::Impl::Edge<lno_t, double>>(), 64);
+#else
+  std::sort(edges.begin(), edges.begin() + ne * 2);
+#endif
+
+  size_type eind = 0;
+  for (lno_t i = 0; i < nv; ++i) {
+    (xadj)[i] = eind;
+    while (edges[eind].src == i) {
+      (adj)[eind] = edges[eind].dst;
+      //(*crs_ew)[eind] = edges[eind].ew;
+      ++eind;
+    }
+  }
+  xadj[nv] = eind;
+}
+
+template <typename lno_t, typename size_type, typename scalar_t>
+void write_graph_bin(lno_t nv, size_type ne, const size_type *xadj,
+                     const lno_t *adj, const scalar_t *ew,
+                     const char *filename) {
+  std::ofstream myFile(filename, std::ios::out | std::ios::binary);
+  myFile.write((char *)&nv, sizeof(lno_t));
+  myFile.write((char *)&ne, sizeof(size_type));
+  myFile.write((char *)xadj, sizeof(size_type) * (nv + 1));
+
+  myFile.write((char *)adj, sizeof(lno_t) * (ne));
+
+  myFile.write((char *)ew, sizeof(scalar_t) * (ne));
+
+  myFile.close();
+}
+
+template <typename lno_t, typename size_type, typename scalar_t>
+void write_graph_crs(lno_t nv, size_type ne, const size_type *xadj,
+                     const lno_t *adj, const scalar_t *ew,
+                     const char *filename) {
+  std::ofstream myFile(filename, std::ios::out);
+  myFile << nv << " " << ne << std::endl;
+
+  for (lno_t i = 0; i <= nv; ++i) {
+    myFile << xadj[i] << " ";
+  }
+  myFile << std::endl;
+
+  for (lno_t i = 0; i < nv; ++i) {
+    size_type b = xadj[i];
+    size_type e = xadj[i + 1];
+    for (size_type j = b; j < e; ++j) {
+      myFile << adj[j] << " ";
+    }
+    myFile << std::endl;
+  }
+  for (size_type i = 0; i < ne; ++i) {
+    myFile << ew[i] << " ";
+  }
+  myFile << std::endl;
+
+  myFile.close();
+}
+
+template <typename lno_t, typename size_type, typename scalar_t>
+void write_graph_ligra(lno_t nv, size_type ne, const size_type *xadj,
+                       const lno_t *adj, const scalar_t * /*ew*/,
+                       const char *filename) {
+  std::ofstream ff(filename);
+  ff << "AdjacencyGraph" << std::endl;
+  ff << nv << std::endl << ne << std::endl;
+  for (lno_t i = 0; i < nv; ++i) {
+    ff << xadj[i] << std::endl;
+  }
+  for (size_type i = 0; i < ne; ++i) {
+    ff << adj[i] << std::endl;
+  }
+  ff.close();
+}
+
+// MM: types and utility functions for parsing the MatrixMarket format
+namespace MM {
+enum MtxObject { UNDEFINED_OBJECT, MATRIX, VECTOR };
+enum MtxFormat { UNDEFINED_FORMAT, COORDINATE, ARRAY };
+enum MtxField {
+  UNDEFINED_FIELD,
+  REAL,     // includes both float and double
+  COMPLEX,  // includes complex<float> and complex<double>
+  INTEGER,  // includes all integer types
+  PATTERN   // not a type, but means the value for every entry is 1
+};
+enum MtxSym {
+  UNDEFINED_SYMMETRY,
+  GENERAL,
+  SYMMETRIC,       // A(i, j) = A(j, i)
+  SKEW_SYMMETRIC,  // A(i, j) = -A(j, i)
+  HERMITIAN        // A(i, j) = a + bi; A(j, i) = a - bi
+};
+
+// readScalar/writeScalar: read and write a scalar in the form that it appears
+// in an .mtx file. The >> and << operators won't work, because complex appears
+// as "real imag", not "(real, imag)"
+template <typename scalar_t>
+scalar_t readScalar(std::istream &is) {
+  scalar_t val;
+  is >> val;
+  return val;
+}
+
+template <>
+inline Kokkos::complex<float> readScalar(std::istream &is) {
+  float r, i;
+  is >> r;
+  is >> i;
+  return Kokkos::complex<float>(r, i);
+}
+
+template <>
+inline Kokkos::complex<double> readScalar(std::istream &is) {
+  double r, i;
+  is >> r;
+  is >> i;
+  return Kokkos::complex<double>(r, i);
+}
+
+template <typename scalar_t>
+void writeScalar(std::ostream &os, scalar_t val) {
+  os << val;
+}
+
+template <>
+inline void writeScalar(std::ostream &os, Kokkos::complex<float> val) {
+  os << val.real() << ' ' << val.imag();
+}
+
+template <>
+inline void writeScalar(std::ostream &os, Kokkos::complex<double> val) {
+  os << val.real() << ' ' << val.imag();
+}
+
+// symmetryFlip: given a value for A(i, j), return the value that
+// should be inserted at A(j, i) (if any)
+template <typename scalar_t>
+scalar_t symmetryFlip(scalar_t val, MtxSym symFlag) {
+  if (symFlag == SKEW_SYMMETRIC) return -val;
+  return val;
+}
+
+template <>
+inline Kokkos::complex<float> symmetryFlip(Kokkos::complex<float> val,
+                                           MtxSym symFlag) {
+  if (symFlag == HERMITIAN)
+    return Kokkos::conj(val);
+  else if (symFlag == SKEW_SYMMETRIC)
+    return -val;
+  return val;
+}
+
+template <>
+inline Kokkos::complex<double> symmetryFlip(Kokkos::complex<double> val,
+                                            MtxSym symFlag) {
+  if (symFlag == HERMITIAN)
+    return Kokkos::conj(val);
+  else if (symFlag == SKEW_SYMMETRIC)
+    return -val;
+  return val;
+}
+}  // namespace MM
+
+template <typename lno_t, typename size_type, typename scalar_t>
+void write_matrix_mtx(lno_t nrows, lno_t ncols, size_type nentries,
+                      const size_type *xadj, const lno_t *adj,
+                      const scalar_t *vals, const char *filename) {
+  std::ofstream myFile(filename);
+  myFile << "%%MatrixMarket matrix coordinate ";
+  if (std::is_same<scalar_t, Kokkos::complex<float>>::value ||
+      std::is_same<scalar_t, Kokkos::complex<double>>::value)
+    myFile << "complex";
+  else
+    myFile << "real";
+  myFile << " general\n";
+  myFile << nrows << " " << ncols << " " << nentries << '\n';
+  myFile << std::setprecision(17) << std::scientific;
+  for (lno_t i = 0; i < nrows; ++i) {
+    size_type b = xadj[i];
+    size_type e = xadj[i + 1];
+    for (size_type j = b; j < e; ++j) {
+      myFile << i + 1 << " " << adj[j] + 1 << " ";
+      MM::writeScalar<scalar_t>(myFile, vals[j]);
+      myFile << '\n';
+    }
+  }
+  myFile.close();
+}
+
+template <typename lno_t, typename size_type, typename scalar_t>
+void write_graph_mtx(lno_t nv, size_type ne, const size_type *xadj,
+                     const lno_t *adj, const scalar_t *ew,
+                     const char *filename) {
+  std::ofstream myFile(filename);
+  myFile << "%%MatrixMarket matrix coordinate ";
+  if (std::is_same<scalar_t, Kokkos::complex<float>>::value ||
+      std::is_same<scalar_t, Kokkos::complex<double>>::value)
+    myFile << "complex";
+  else
+    myFile << "real";
+  myFile << " general\n";
+  myFile << nv << " " << nv << " " << ne << '\n';
+  myFile << std::setprecision(8) << std::scientific;
+  for (lno_t i = 0; i < nv; ++i) {
+    size_type b = xadj[i];
+    size_type e = xadj[i + 1];
+    for (size_type j = b; j < e; ++j) {
+      myFile << i + 1 << " " << (adj)[j] + 1 << " ";
+      MM::writeScalar<scalar_t>(myFile, ew[j]);
+      myFile << '\n';
+    }
+  }
+
+  myFile.close();
+}
+
+template <typename lno_t, typename size_type, typename scalar_t>
+void read_graph_bin(lno_t *nv, size_type *ne, size_type **xadj, lno_t **adj,
+                    scalar_t **ew, const char *filename) {
+  std::ifstream myFile(filename, std::ios::in | std::ios::binary);
+
+  myFile.read((char *)nv, sizeof(lno_t));
+  myFile.read((char *)ne, sizeof(size_type));
+  KokkosKernels::Impl::md_malloc<size_type>(xadj, *nv + 1);
+  KokkosKernels::Impl::md_malloc<lno_t>(adj, *ne);
+  KokkosKernels::Impl::md_malloc<scalar_t>(ew, *ne);
+  myFile.read((char *)*xadj, sizeof(size_type) * (*nv + 1));
+  myFile.read((char *)*adj, sizeof(lno_t) * (*ne));
+  myFile.read((char *)*ew, sizeof(scalar_t) * (*ne));
+  myFile.close();
+}
+
+// When Kokkos issue #2313 is resolved, can delete
+// parseScalar and just use operator>>
+template <typename scalar_t>
+scalar_t parseScalar(std::istream &is) {
+  scalar_t val;
+  is >> val;
+  return val;
+}
+
+template <>
+inline Kokkos::complex<float> parseScalar(std::istream &is) {
+  std::complex<float> val;
+  is >> val;
+  return Kokkos::complex<float>(val);
+}
+
+template <>
+inline Kokkos::complex<double> parseScalar(std::istream &is) {
+  std::complex<double> val;
+  is >> val;
+  return Kokkos::complex<double>(val);
+}
+
+template <typename lno_t, typename size_type, typename scalar_t>
+void read_graph_crs(lno_t *nv, size_type *ne, size_type **xadj, lno_t **adj,
+                    scalar_t **ew, const char *filename) {
+  std::ifstream myFile(filename, std::ios::in);
+  myFile >> *nv >> *ne;
+
+  KokkosKernels::Impl::md_malloc<size_type>(xadj, *nv + 1);
+  KokkosKernels::Impl::md_malloc<lno_t>(adj, *ne);
+  KokkosKernels::Impl::md_malloc<scalar_t>(ew, *ne);
+
+  for (lno_t i = 0; i <= *nv; ++i) {
+    myFile >> (*xadj)[i];
+  }
+
+  for (size_type i = 0; i < *ne; ++i) {
+    myFile >> (*adj)[i];
+  }
+  for (size_type i = 0; i < *ne; ++i) {
+    (*ew)[i] = parseScalar<scalar_t>(myFile);
+  }
+  myFile.close();
+}
+
+template <typename crs_matrix_t>
+void write_kokkos_crst_matrix(crs_matrix_t a_crsmat, const char *filename) {
+  typedef typename crs_matrix_t::StaticCrsGraphType graph_t;
+  typedef typename graph_t::row_map_type::non_const_type row_map_view_t;
+  typedef typename graph_t::entries_type::non_const_type cols_view_t;
+  typedef typename crs_matrix_t::values_type::non_const_type values_view_t;
+
+  typedef typename row_map_view_t::value_type offset_t;
+  typedef typename cols_view_t::value_type lno_t;
+  typedef typename values_view_t::value_type scalar_t;
+  typedef typename values_view_t::size_type size_type;
+
+  size_type nnz = a_crsmat.nnz();
+
+  auto a_rowmap_view = Kokkos::create_mirror_view_and_copy(
+      Kokkos::HostSpace(), a_crsmat.graph.row_map);
+  auto a_entries_view = Kokkos::create_mirror_view_and_copy(
+      Kokkos::HostSpace(), a_crsmat.graph.entries);
+  auto a_values_view =
+      Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), a_crsmat.values);
+  offset_t *a_rowmap = const_cast<offset_t *>(a_rowmap_view.data());
+  lno_t *a_entries   = a_entries_view.data();
+  scalar_t *a_values = a_values_view.data();
+
+  std::string strfilename(filename);
+  if (KokkosKernels::Impl::endswith(strfilename, ".mtx") ||
+      KokkosKernels::Impl::endswith(strfilename, ".mm")) {
+    write_matrix_mtx<lno_t, offset_t, scalar_t>(
+        a_crsmat.numRows(), a_crsmat.numCols(), a_crsmat.nnz(), a_rowmap,
+        a_entries, a_values, filename);
+    return;
+  } else if (a_crsmat.numRows() != a_crsmat.numCols()) {
+    throw std::runtime_error(
+        "For formats other than MatrixMarket (suffix .mm or .mtx),\n"
+        "write_kokkos_crst_matrix only supports square matrices");
+  }
+  if (KokkosKernels::Impl::endswith(strfilename, ".bin")) {
+    write_graph_bin<lno_t, offset_t, scalar_t>(
+        a_crsmat.numRows(), nnz, a_rowmap, a_entries, a_values, filename);
+  } else if (KokkosKernels::Impl::endswith(strfilename, ".ligra")) {
+    write_graph_ligra<lno_t, offset_t, scalar_t>(
+        a_crsmat.numRows(), nnz, a_rowmap, a_entries, a_values, filename);
+  } else if (KokkosKernels::Impl::endswith(strfilename, ".crs")) {
+    write_graph_crs<lno_t, offset_t, scalar_t>(
+        a_crsmat.numRows(), nnz, a_rowmap, a_entries, a_values, filename);
+  } else {
+    std::string errMsg =
+        std::string("write_kokkos_crst_matrix: File extension on ") + filename +
+        " does not correspond to a known format";
+    throw std::runtime_error(errMsg);
+  }
+}
+
+template <typename lno_t, typename size_type, typename scalar_t>
+int read_mtx(const char *fileName, lno_t *nrows, lno_t *ncols, size_type *ne,
+             size_type **xadj, lno_t **adj, scalar_t **ew,
+             bool symmetrize = false, bool remove_diagonal = true,
+             bool transpose = false) {
+  using namespace MM;
+  std::ifstream mmf(fileName, std::ifstream::in);
+  if (!mmf.is_open()) {
+    throw std::runtime_error("File cannot be opened\n");
+  }
+
+  std::string fline = "";
+  getline(mmf, fline);
+
+  if (fline.size() < 2 || fline[0] != '%' || fline[1] != '%') {
+    throw std::runtime_error("Invalid MM file. Line-1\n");
+  }
+
+  // make sure every required field is in the file, by initializing them to
+  // UNDEFINED_*
+  MtxObject mtx_object = UNDEFINED_OBJECT;
+  MtxFormat mtx_format = UNDEFINED_FORMAT;
+  MtxField mtx_field   = UNDEFINED_FIELD;
+  MtxSym mtx_sym       = UNDEFINED_SYMMETRY;
+
+  if (fline.find("matrix") != std::string::npos) {
+    mtx_object = MATRIX;
+  } else if (fline.find("vector") != std::string::npos) {
+    mtx_object = VECTOR;
+    throw std::runtime_error(
+        "MatrixMarket \"vector\" is not supported by KokkosKernels read_mtx()");
+  }
+
+  if (fline.find("coordinate") != std::string::npos) {
+    // sparse
+    mtx_format = COORDINATE;
+  } else if (fline.find("array") != std::string::npos) {
+    // dense
+    mtx_format = ARRAY;
+  }
+
+  if (fline.find("real") != std::string::npos ||
+      fline.find("double") != std::string::npos) {
+    if (std::is_same<scalar_t, Kokkos::Experimental::half_t>::value ||
+        std::is_same<scalar_t, Kokkos::Experimental::bhalf_t>::value)
+      mtx_field = REAL;
+    else {
+      if (!std::is_floating_point<scalar_t>::value)
+        throw std::runtime_error(
+            "scalar_t in read_mtx() incompatible with float or double typed "
+            "MatrixMarket file.");
+      else
+        mtx_field = REAL;
+    }
+  } else if (fline.find("complex") != std::string::npos) {
+    if (!(std::is_same<scalar_t, Kokkos::complex<float>>::value ||
+          std::is_same<scalar_t, Kokkos::complex<double>>::value))
+      throw std::runtime_error(
+          "scalar_t in read_mtx() incompatible with complex-typed MatrixMarket "
+          "file.");
+    else
+      mtx_field = COMPLEX;
+  } else if (fline.find("integer") != std::string::npos) {
+    if (std::is_integral<scalar_t>::value ||
+        std::is_floating_point<scalar_t>::value ||
+        std::is_same<scalar_t, Kokkos::Experimental::half_t>::value ||
+        std::is_same<scalar_t, Kokkos::Experimental::bhalf_t>::value)
+      mtx_field = INTEGER;
+    else
+      throw std::runtime_error(
+          "scalar_t in read_mtx() incompatible with integer-typed MatrixMarket "
+          "file.");
+  } else if (fline.find("pattern") != std::string::npos) {
+    mtx_field = PATTERN;
+    // any reasonable choice for scalar_t can represent "1" or "1.0 + 0i", so
+    // nothing to check here
+  }
+
+  if (fline.find("general") != std::string::npos) {
+    mtx_sym = GENERAL;
+  } else if (fline.find("skew-symmetric") != std::string::npos) {
+    mtx_sym = SKEW_SYMMETRIC;
+  } else if (fline.find("symmetric") != std::string::npos) {
+    // checking for "symmetric" after "skew-symmetric" because it's a substring
+    mtx_sym = SYMMETRIC;
+  } else if (fline.find("hermitian") != std::string::npos ||
+             fline.find("Hermitian") != std::string::npos) {
+    mtx_sym = HERMITIAN;
+  }
+  // Validate the matrix attributes
+  if (mtx_format == ARRAY) {
+    if (mtx_sym == UNDEFINED_SYMMETRY) mtx_sym = GENERAL;
+    if (mtx_sym != GENERAL)
+      throw std::runtime_error(
+          "array format MatrixMarket file must have general symmetry (optional "
+          "to include \"general\")");
+  }
+  if (mtx_object == UNDEFINED_OBJECT)
+    throw std::runtime_error(
+        "MatrixMarket file header is missing the object type.");
+  if (mtx_format == UNDEFINED_FORMAT)
+    throw std::runtime_error("MatrixMarket file header is missing the format.");
+  if (mtx_field == UNDEFINED_FIELD)
+    throw std::runtime_error(
+        "MatrixMarket file header is missing the field type.");
+  if (mtx_sym == UNDEFINED_SYMMETRY)
+    throw std::runtime_error(
+        "MatrixMarket file header is missing the symmetry type.");
+
+  while (1) {
+    getline(mmf, fline);
+    if (fline[0] != '%') break;
+  }
+  std::stringstream ss(fline);
+  lno_t nr = 0, nc = 0;
+  size_type nnz = 0;
+  ss >> nr >> nc;
+  if (mtx_format == COORDINATE)
+    ss >> nnz;
+  else
+    nnz = nr * nc;
+  size_type numEdges = nnz;
+  symmetrize         = symmetrize || mtx_sym != GENERAL;
+  if (symmetrize && nr != nc) {
+    throw std::runtime_error("A non-square matrix cannot be symmetrized.");
+  }
+  if (mtx_format == ARRAY) {
+    // Array format only supports general symmetry and non-pattern
+    if (symmetrize)
+      throw std::runtime_error(
+          "array format MatrixMarket file cannot be symmetrized.");
+    if (mtx_field == PATTERN)
+      throw std::runtime_error(
+          "array format MatrixMarket file can't have \"pattern\" field type.");
+  }
+  if (symmetrize) {
+    numEdges = 2 * nnz;
+  }
+  // numEdges is only an upper bound (diagonal entries may be removed)
+  std::vector<struct KokkosKernels::Impl::Edge<lno_t, scalar_t>> edges(
+      numEdges);
+  size_type nE      = 0;
+  lno_t numDiagonal = 0;
+  for (size_type i = 0; i < nnz; ++i) {
+    getline(mmf, fline);
+    std::stringstream ss2(fline);
+    struct KokkosKernels::Impl::Edge<lno_t, scalar_t> tmp;
+    // read source, dest (edge) and weight (value)
+    lno_t s, d;
+    scalar_t w;
+    if (mtx_format == ARRAY) {
+      // In array format, entries are listed in column major order,
+      // so the row and column can be determined just from the index i
+      //(but make them 1-based indices, to match the way coordinate works)
+      s = i % nr + 1;  // row
+      d = i / nr + 1;  // col
+    } else {
+      // In coordinate format, row and col of each entry is read from file
+      ss2 >> s >> d;
+    }
+    if (mtx_field == PATTERN)
+      w = 1;
+    else
+      w = readScalar<scalar_t>(ss2);
+    if (!transpose) {
+      tmp.src = s - 1;
+      tmp.dst = d - 1;
+      tmp.ew  = w;
+    } else {
+      tmp.src = d - 1;
+      tmp.dst = s - 1;
+      tmp.ew  = w;
+    }
+    if (tmp.src == tmp.dst) {
+      numDiagonal++;
+      if (!remove_diagonal) {
+        edges[nE++] = tmp;
+      }
+      continue;
+    }
+    edges[nE++] = tmp;
+    if (symmetrize) {
+      struct KokkosKernels::Impl::Edge<lno_t, scalar_t> tmp2;
+      tmp2.src = tmp.dst;
+      tmp2.dst = tmp.src;
+      // the symmetrized value is w, -w or conj(w) if mtx_sym is
+      // SYMMETRIC, SKEW_SYMMETRIC or HERMITIAN, respectively.
+      tmp2.ew     = symmetryFlip<scalar_t>(tmp.ew, mtx_sym);
+      edges[nE++] = tmp2;
+    }
+  }
+  mmf.close();
+  std::sort(edges.begin(), edges.begin() + nE);
+  if (transpose) {
+    lno_t tmp = nr;
+    nr        = nc;
+    nc        = tmp;
+  }
+  // idx *nv, idx *ne, idx **xadj, idx **adj, wt **wt
+  *nrows = nr;
+  *ncols = nc;
+  *ne    = nE;
+  //*xadj = new idx[nr + 1];
+  KokkosKernels::Impl::md_malloc<size_type>(xadj, nr + 1);
+  //*adj = new idx[nE];
+  KokkosKernels::Impl::md_malloc<lno_t>(adj, nE);
+  //*ew = new wt[nE];
+  KokkosKernels::Impl::md_malloc<scalar_t>(ew, nE);
+  size_type eind   = 0;
+  size_type actual = 0;
+  for (lno_t i = 0; i < nr; ++i) {
+    (*xadj)[i]    = actual;
+    bool is_first = true;
+    while (eind < nE && edges[eind].src == i) {
+      if (is_first || !symmetrize || eind == 0 ||
+          (eind > 0 && edges[eind - 1].dst != edges[eind].dst)) {
+        (*adj)[actual] = edges[eind].dst;
+        (*ew)[actual]  = edges[eind].ew;
+        ++actual;
+      }
+      is_first = false;
+      ++eind;
+    }
+  }
+  (*xadj)[nr] = actual;
+  *ne         = actual;
+  return 0;
+}
+
+// Version of read_mtx which does not capture the number of columns.
+// This is the old interface; it's kept for backwards compatibility.
+template <typename lno_t, typename size_type, typename scalar_t>
+int read_mtx(const char *fileName, lno_t *nv, size_type *ne, size_type **xadj,
+             lno_t **adj, scalar_t **ew, bool symmetrize = false,
+             bool remove_diagonal = true, bool transpose = false) {
+  lno_t ncol;  // will discard
+  return read_mtx<lno_t, size_type, scalar_t>(fileName, nv, &ncol, ne, xadj,
+                                              adj, ew, symmetrize,
+                                              remove_diagonal, transpose);
+}
+
+template <typename lno_t, typename size_type, typename scalar_t>
+void read_matrix(lno_t *nv, size_type *ne, size_type **xadj, lno_t **adj,
+                 scalar_t **ew, const char *filename) {
+  std::string strfilename(filename);
+  if (KokkosKernels::Impl::endswith(strfilename, ".mtx") ||
+      KokkosKernels::Impl::endswith(strfilename, ".mm")) {
+    read_mtx(filename, nv, ne, xadj, adj, ew, false, false, false);
+  }
+
+  else if (KokkosKernels::Impl::endswith(strfilename, ".bin")) {
+    read_graph_bin(nv, ne, xadj, adj, ew, filename);
+  }
+
+  else if (KokkosKernels::Impl::endswith(strfilename, ".crs")) {
+    read_graph_crs(nv, ne, xadj, adj, ew, filename);
+  }
+
+  else {
+    throw std::runtime_error("Reader is not available\n");
+  }
+}
+
+template <typename crsMat_t>
+crsMat_t read_kokkos_crst_matrix(const char *filename_) {
+  std::string strfilename(filename_);
+  bool isMatrixMarket = KokkosKernels::Impl::endswith(strfilename, ".mtx") ||
+                        KokkosKernels::Impl::endswith(strfilename, ".mm");
+
+  typedef typename crsMat_t::StaticCrsGraphType graph_t;
+  typedef typename graph_t::row_map_type::non_const_type row_map_view_t;
+  typedef typename graph_t::entries_type::non_const_type cols_view_t;
+  typedef typename crsMat_t::values_type::non_const_type values_view_t;
+
+  typedef typename row_map_view_t::value_type size_type;
+  typedef typename cols_view_t::value_type lno_t;
+  typedef typename values_view_t::value_type scalar_t;
+
+  lno_t nr, nc, *adj;
+  size_type *xadj, nnzA;
+  scalar_t *values;
+
+  if (isMatrixMarket) {
+    // MatrixMarket file contains the exact number of columns
+    read_mtx<lno_t, size_type, scalar_t>(filename_, &nr, &nc, &nnzA, &xadj,
+                                         &adj, &values, false, false, false);
+  } else {
+    //.crs and .bin files don't contain #cols, so will compute it later based on
+    // the entries
+    read_matrix<lno_t, size_type, scalar_t>(&nr, &nnzA, &xadj, &adj, &values,
+                                            filename_);
+  }
+
+  row_map_view_t rowmap_view("rowmap_view", nr + 1);
+  cols_view_t columns_view("colsmap_view", nnzA);
+  values_view_t values_view("values_view", nnzA);
+
+  {
+    Kokkos::View<size_type *, Kokkos::HostSpace,
+                 Kokkos::MemoryTraits<Kokkos::Unmanaged>>
+        hr(xadj, nr + 1);
+    Kokkos::View<lno_t *, Kokkos::HostSpace,
+                 Kokkos::MemoryTraits<Kokkos::Unmanaged>>
+        hc(adj, nnzA);
+    Kokkos::View<scalar_t *, Kokkos::HostSpace,
+                 Kokkos::MemoryTraits<Kokkos::Unmanaged>>
+        hv(values, nnzA);
+    Kokkos::deep_copy(rowmap_view, hr);
+    Kokkos::deep_copy(columns_view, hc);
+    Kokkos::deep_copy(values_view, hv);
+  }
+
+  if (!isMatrixMarket) {
+    KokkosKernels::Impl::kk_view_reduce_max<cols_view_t,
+                                            typename crsMat_t::execution_space>(
+        nnzA, columns_view, nc);
+    nc++;
+  }
+
+  graph_t static_graph(columns_view, rowmap_view);
+  crsMat_t crsmat("CrsMatrix", nc, values_view, static_graph);
+  delete[] xadj;
+  delete[] adj;
+  delete[] values;
+  return crsmat;
+}
+
+template <typename crsGraph_t>
+crsGraph_t read_kokkos_crst_graph(const char *filename_) {
+  typedef typename crsGraph_t::row_map_type::non_const_type row_map_view_t;
+  typedef typename crsGraph_t::entries_type::non_const_type cols_view_t;
+
+  typedef typename row_map_view_t::value_type size_type;
+  typedef typename cols_view_t::value_type lno_t;
+  typedef double scalar_t;
+
+  lno_t nv, *adj;
+  size_type *xadj, nnzA;
+  scalar_t *values;
+  read_matrix<lno_t, size_type, scalar_t>(&nv, &nnzA, &xadj, &adj, &values,
+                                          filename_);
+
+  row_map_view_t rowmap_view("rowmap_view", nv + 1);
+  cols_view_t columns_view("colsmap_view", nnzA);
+
+  {
+    typename row_map_view_t::HostMirror hr =
+        Kokkos::create_mirror_view(rowmap_view);
+    typename cols_view_t::HostMirror hc =
+        Kokkos::create_mirror_view(columns_view);
+
+    for (lno_t i = 0; i <= nv; ++i) {
+      hr(i) = xadj[i];
+    }
+
+    for (size_type i = 0; i < nnzA; ++i) {
+      hc(i) = adj[i];
+    }
+    Kokkos::deep_copy(rowmap_view, hr);
+    Kokkos::deep_copy(columns_view, hc);
+  }
+
+  lno_t ncols = 0;
+  KokkosKernels::Impl::kk_view_reduce_max<cols_view_t,
+                                          typename crsGraph_t::execution_space>(
+      nnzA, columns_view, ncols);
+  ncols += 1;
+
+  crsGraph_t static_graph(columns_view, rowmap_view, ncols);
+  delete[] xadj;
+  delete[] adj;
+  delete[] values;
+  return static_graph;
+}
+
+template <typename size_type, typename nnz_lno_t>
+inline void kk_sequential_create_incidence_matrix(
+    nnz_lno_t num_rows, const size_type *xadj, const nnz_lno_t *adj,
+    size_type *i_adj  // output. preallocated
+) {
+  std::vector<size_type> c_xadj(num_rows);
+  for (nnz_lno_t i = 0; i < num_rows; i++) {
+    c_xadj[i] = xadj[i];
+  }
+  int eCnt = 0;
+  for (nnz_lno_t i = 0; i < num_rows; i++) {
+    size_type begin   = xadj[i];
+    size_type end     = xadj[i + 1];
+    nnz_lno_t adjsize = end - begin;
+
+    for (nnz_lno_t j = 0; j < adjsize; j++) {
+      size_type aind = j + begin;
+      nnz_lno_t col  = adj[aind];
+      if (i < col) {
+        i_adj[c_xadj[i]++]   = eCnt;
+        i_adj[c_xadj[col]++] = eCnt++;
+      }
+    }
+  }
+
+  for (nnz_lno_t i = 0; i < num_rows; i++) {
+    if (c_xadj[i] != xadj[i + 1]) {
+      std::cout << "i:" << i << " c_xadj[i]:" << c_xadj[i]
+                << " xadj[i+1]:" << xadj[i + 1] << std::endl;
+    }
+  }
+}
+
+template <typename size_type, typename nnz_lno_t>
+inline void kk_sequential_create_incidence_matrix_transpose(
+    const nnz_lno_t num_rows, const size_type num_edges, const size_type *xadj,
+    const nnz_lno_t *adj,
+    size_type *i_xadj,  // output. preallocated
+    nnz_lno_t *i_adj    // output. preallocated
+) {
+  for (nnz_lno_t i = 0; i < num_edges / 2 + 1; i++) {
+    i_xadj[i] = i * 2;
+  }
+  int eCnt = 0;
+  for (nnz_lno_t i = 0; i < num_rows; i++) {
+    size_type begin   = xadj[i];
+    size_type end     = xadj[i + 1];
+    nnz_lno_t adjsize = end - begin;
+
+    for (nnz_lno_t j = 0; j < adjsize; j++) {
+      size_type aind = j + begin;
+      nnz_lno_t col  = adj[aind];
+      if (i < col) {
+        i_adj[eCnt++] = i;
+        i_adj[eCnt++] = col;
+      }
+    }
+  }
+}
+
+}  // namespace Impl
+}  // namespace KokkosSparse
+#endif  // _KOKKOSSPARSE_IOUTILS_HPP
diff --git a/src/sparse/KokkosSparse_SortCrs.hpp b/src/sparse/KokkosSparse_SortCrs.hpp
new file mode 100644
index 0000000000..68de6b5f7c
--- /dev/null
+++ b/src/sparse/KokkosSparse_SortCrs.hpp
@@ -0,0 +1,722 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+#ifndef _KOKKOSSPARSE_SORTCRS_HPP
+#define _KOKKOSSPARSE_SORTCRS_HPP
+
+#include "Kokkos_Core.hpp"
+#include "KokkosKernels_Sorting.hpp"
+
+namespace KokkosSparse {
+
+// ----------------------------------
+// BSR matrix/graph sorting utilities
+// ----------------------------------
+
+// Sort a BRS matrix: within each row, sort entries ascending by column and
+// permute the values accordingly.
+template <typename execution_space, typename rowmap_t, typename entries_t,
+          typename values_t,
+          typename lno_t = typename entries_t::non_const_value_type>
+void sort_bsr_matrix(const lno_t blockdim, const rowmap_t& rowmap,
+                     const entries_t& entries, const values_t& values);
+
+template <typename bsrMat_t>
+void sort_bsr_matrix(const bsrMat_t& A);
+
+// ----------------------------------
+// CRS matrix/graph sorting utilities
+// ----------------------------------
+
+// The sort_crs* functions sort the adjacent column list for each row into
+// ascending order.
+
+template <typename execution_space, typename rowmap_t, typename entries_t,
+          typename values_t>
+void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries,
+                     const values_t& values);
+
+template <typename crsMat_t>
+void sort_crs_matrix(const crsMat_t& A);
+
+template <typename execution_space, typename rowmap_t, typename entries_t>
+void sort_crs_graph(const rowmap_t& rowmap, const entries_t& entries);
+
+template <typename crsGraph_t>
+void sort_crs_graph(const crsGraph_t& G);
+
+// sort_and_merge_matrix produces a new matrix which is equivalent to A but is
+// sorted and has no duplicated entries: each (i, j) is unique. Values for
+// duplicated entries are summed.
+template <typename crsMat_t>
+crsMat_t sort_and_merge_matrix(const crsMat_t& A);
+
+template <typename crsGraph_t>
+crsGraph_t sort_and_merge_graph(const crsGraph_t& G);
+
+template <typename exec_space, typename rowmap_t, typename entries_t>
+void sort_and_merge_graph(const typename rowmap_t::const_type& rowmap_in,
+                          const entries_t& entries_in, rowmap_t& rowmap_out,
+                          entries_t& entries_out);
+
+namespace Impl {
+
+template <typename execution_space, typename rowmap_t, typename entries_t,
+          typename values_t>
+struct SortCrsMatrixFunctor {
+  using size_type = typename rowmap_t::non_const_value_type;
+  using lno_t     = typename entries_t::non_const_value_type;
+  using scalar_t  = typename values_t::non_const_value_type;
+  using team_mem  = typename Kokkos::TeamPolicy<execution_space>::member_type;
+  // The functor owns memory for entriesAux, so it can't have
+  // MemoryTraits<Unmanaged>
+  using entries_managed_t = Kokkos::View<typename entries_t::data_type,
+                                         typename entries_t::device_type>;
+  using values_managed_t  = Kokkos::View<typename values_t::data_type,
+                                        typename values_t::device_type>;
+
+  SortCrsMatrixFunctor(bool usingRangePol, const rowmap_t& rowmap_,
+                       const entries_t& entries_, const values_t& values_)
+      : rowmap(rowmap_), entries(entries_), values(values_) {
+    if (usingRangePol) {
+      entriesAux = entries_managed_t(
+          Kokkos::view_alloc(Kokkos::WithoutInitializing, "Entries aux"),
+          entries.extent(0));
+      valuesAux = values_managed_t(
+          Kokkos::view_alloc(Kokkos::WithoutInitializing, "Values aux"),
+          values.extent(0));
+    }
+    // otherwise, aux arrays won't be allocated (sorting in place)
+  }
+
+  KOKKOS_INLINE_FUNCTION void operator()(const lno_t i) const {
+    size_type rowStart = rowmap(i);
+    size_type rowEnd   = rowmap(i + 1);
+    lno_t rowNum       = rowEnd - rowStart;
+    // Radix sort requires unsigned keys for comparison
+    using unsigned_lno_t = typename std::make_unsigned<lno_t>::type;
+    KokkosKernels::SerialRadixSort2<lno_t, unsigned_lno_t, scalar_t>(
+        (unsigned_lno_t*)entries.data() + rowStart,
+        (unsigned_lno_t*)entriesAux.data() + rowStart, values.data() + rowStart,
+        valuesAux.data() + rowStart, rowNum);
+  }
+
+  KOKKOS_INLINE_FUNCTION void operator()(const team_mem t) const {
+    size_type i        = t.league_rank();
+    size_type rowStart = rowmap(i);
+    size_type rowEnd   = rowmap(i + 1);
+    lno_t rowNum       = rowEnd - rowStart;
+    KokkosKernels::TeamBitonicSort2<lno_t, lno_t, scalar_t, team_mem>(
+        entries.data() + rowStart, values.data() + rowStart, rowNum, t);
+  }
+
+  rowmap_t rowmap;
+  entries_t entries;
+  entries_managed_t entriesAux;
+  values_t values;
+  values_managed_t valuesAux;
+};
+
+template <typename execution_space, typename rowmap_t, typename entries_t>
+struct SortCrsGraphFunctor {
+  using size_type = typename rowmap_t::non_const_value_type;
+  using lno_t     = typename entries_t::non_const_value_type;
+  using team_mem  = typename Kokkos::TeamPolicy<execution_space>::member_type;
+  // The functor owns memory for entriesAux, so it can't have
+  // MemoryTraits<Unmanaged>
+  using entries_managed_t = Kokkos::View<typename entries_t::data_type,
+                                         typename entries_t::device_type>;
+
+  SortCrsGraphFunctor(bool usingRangePol, const rowmap_t& rowmap_,
+                      const entries_t& entries_)
+      : rowmap(rowmap_), entries(entries_) {
+    if (usingRangePol) {
+      entriesAux = entries_managed_t(
+          Kokkos::view_alloc(Kokkos::WithoutInitializing, "Entries aux"),
+          entries.extent(0));
+    }
+    // otherwise, aux arrays won't be allocated (sorting in place)
+  }
+
+  KOKKOS_INLINE_FUNCTION void operator()(const lno_t i) const {
+    size_type rowStart = rowmap(i);
+    size_type rowEnd   = rowmap(i + 1);
+    lno_t rowNum       = rowEnd - rowStart;
+    // Radix sort requires unsigned keys for comparison
+    using unsigned_lno_t = typename std::make_unsigned<lno_t>::type;
+    KokkosKernels::SerialRadixSort<lno_t, unsigned_lno_t>(
+        (unsigned_lno_t*)entries.data() + rowStart,
+        (unsigned_lno_t*)entriesAux.data() + rowStart, rowNum);
+  }
+
+  KOKKOS_INLINE_FUNCTION void operator()(const team_mem t) const {
+    size_type i        = t.league_rank();
+    size_type rowStart = rowmap(i);
+    size_type rowEnd   = rowmap(i + 1);
+    lno_t rowNum       = rowEnd - rowStart;
+    KokkosKernels::TeamBitonicSort<lno_t, lno_t, team_mem>(
+        entries.data() + rowStart, rowNum, t);
+  }
+
+  rowmap_t rowmap;
+  entries_t entries;
+  entries_managed_t entriesAux;
+};
+
+template <typename rowmap_t, typename entries_t>
+struct MergedRowmapFunctor {
+  using size_type  = typename rowmap_t::non_const_value_type;
+  using lno_t      = typename entries_t::non_const_value_type;
+  using c_rowmap_t = typename rowmap_t::const_type;
+
+  // Precondition: entries are sorted within each row
+  MergedRowmapFunctor(const rowmap_t& mergedCounts_, const c_rowmap_t& rowmap_,
+                      const entries_t& entries_)
+      : mergedCounts(mergedCounts_), rowmap(rowmap_), entries(entries_) {}
+
+  KOKKOS_INLINE_FUNCTION void operator()(lno_t row, size_type& lnewNNZ) const {
+    size_type rowBegin = rowmap(row);
+    size_type rowEnd   = rowmap(row + 1);
+    if (rowEnd == rowBegin) {
+      // Row was empty to begin with
+      mergedCounts(row) = 0;
+      return;
+    }
+    // Otherwise, the first entry in the row exists
+    lno_t uniqueEntries = 1;
+    for (size_type j = rowBegin + 1; j < rowEnd; j++) {
+      if (entries(j - 1) != entries(j)) uniqueEntries++;
+    }
+    mergedCounts(row) = uniqueEntries;
+    lnewNNZ += uniqueEntries;
+    if (row == lno_t((rowmap.extent(0) - 1) - 1)) mergedCounts(row + 1) = 0;
+  }
+
+  rowmap_t mergedCounts;
+  c_rowmap_t rowmap;
+  entries_t entries;
+};
+
+template <typename rowmap_t, typename entries_t, typename values_t>
+struct MatrixMergedEntriesFunctor {
+  using size_type = typename rowmap_t::non_const_value_type;
+  using lno_t     = typename entries_t::non_const_value_type;
+  using scalar_t  = typename values_t::non_const_value_type;
+
+  // Precondition: entries are sorted within each row
+  MatrixMergedEntriesFunctor(const rowmap_t& rowmap_, const entries_t& entries_,
+                             const values_t& values_,
+                             const rowmap_t& mergedRowmap_,
+                             const entries_t& mergedEntries_,
+                             const values_t& mergedValues_)
+      : rowmap(rowmap_),
+        entries(entries_),
+        values(values_),
+        mergedRowmap(mergedRowmap_),
+        mergedEntries(mergedEntries_),
+        mergedValues(mergedValues_) {}
+
+  KOKKOS_INLINE_FUNCTION void operator()(lno_t row) const {
+    size_type rowBegin = rowmap(row);
+    size_type rowEnd   = rowmap(row + 1);
+    if (rowEnd == rowBegin) {
+      // Row was empty to begin with, nothing to do
+      return;
+    }
+    // Otherwise, accumulate the value for each column
+    scalar_t accumVal   = values(rowBegin);
+    lno_t accumCol      = entries(rowBegin);
+    size_type insertPos = mergedRowmap(row);
+    for (size_type j = rowBegin + 1; j < rowEnd; j++) {
+      if (accumCol == entries(j)) {
+        // accumulate
+        accumVal += values(j);
+      } else {
+        // write out and reset
+        mergedValues(insertPos)  = accumVal;
+        mergedEntries(insertPos) = accumCol;
+        insertPos++;
+        accumVal = values(j);
+        accumCol = entries(j);
+      }
+    }
+    // always left with the last unique entry
+    mergedValues(insertPos)  = accumVal;
+    mergedEntries(insertPos) = accumCol;
+  }
+
+  rowmap_t rowmap;
+  entries_t entries;
+  values_t values;
+  rowmap_t mergedRowmap;
+  entries_t mergedEntries;
+  values_t mergedValues;
+};
+
+template <typename rowmap_t, typename entries_t>
+struct GraphMergedEntriesFunctor {
+  using size_type = typename rowmap_t::non_const_value_type;
+  using lno_t     = typename entries_t::non_const_value_type;
+
+  // Precondition: entries are sorted within each row
+  GraphMergedEntriesFunctor(const rowmap_t& rowmap_, const entries_t& entries_,
+                            const rowmap_t& mergedRowmap_,
+                            const entries_t& mergedEntries_)
+      : rowmap(rowmap_),
+        entries(entries_),
+        mergedRowmap(mergedRowmap_),
+        mergedEntries(mergedEntries_) {}
+
+  KOKKOS_INLINE_FUNCTION void operator()(lno_t row) const {
+    size_type rowBegin = rowmap(row);
+    size_type rowEnd   = rowmap(row + 1);
+    if (rowEnd == rowBegin) {
+      // Row was empty to begin with, nothing to do
+      return;
+    }
+    // Otherwise, accumulate the value for each column
+    lno_t accumCol      = entries(rowBegin);
+    size_type insertPos = mergedRowmap(row);
+    for (size_type j = rowBegin + 1; j < rowEnd; j++) {
+      if (accumCol != entries(j)) {
+        // write out and reset
+        mergedEntries(insertPos) = accumCol;
+        insertPos++;
+        accumCol = entries(j);
+      }
+    }
+    // always left with the last unique entry
+    mergedEntries(insertPos) = accumCol;
+  }
+
+  rowmap_t rowmap;
+  entries_t entries;
+  rowmap_t mergedRowmap;
+  entries_t mergedEntries;
+};
+
+template <typename T>
+KOKKOS_INLINE_FUNCTION void kk_swap(T& a, T& b) {
+  T t = a;
+  a   = b;
+  b   = t;
+}
+
+template <typename row_map_type, typename entries_type, typename values_type>
+struct sort_bsr_functor {
+  using lno_t = typename entries_type::non_const_value_type;
+
+  row_map_type rowmap;
+  entries_type entries;
+  values_type values;
+  const lno_t blocksize;
+
+  sort_bsr_functor(row_map_type rowmap_, entries_type entries_,
+                   values_type values_, const lno_t blocksize_)
+      : rowmap(rowmap_),
+        entries(entries_),
+        values(values_),
+        blocksize(blocksize_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const lno_t i) const {
+    const lno_t rowStart = rowmap(i);
+    const lno_t rowSize  = rowmap(i + 1) - rowStart;
+    auto* e              = entries.data() + rowStart;
+    auto* v              = values.data() + rowStart * blocksize;
+    bool done            = false;
+    while (!done) {
+      done = true;
+      for (lno_t j = 1; j < rowSize; ++j) {
+        const lno_t jp = j - 1;
+        if (e[jp] <= e[j]) continue;
+        Impl::kk_swap(e[jp], e[j]);
+        auto const vb  = v + j * blocksize;
+        auto const vbp = v + jp * blocksize;
+        for (lno_t k = 0; k < blocksize;
+             ++k)  // std::swap_ranges(vb, vb + blocksize, vbp);
+          Impl::kk_swap(vb[k], vbp[k]);
+        done = false;
+      }
+    }
+  }
+};
+
+}  // namespace Impl
+
+// Sort a CRS matrix: within each row, sort entries ascending by column.
+// At the same time, permute the values.
+template <typename execution_space, typename rowmap_t, typename entries_t,
+          typename values_t>
+void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries,
+                     const values_t& values) {
+  using lno_t    = typename entries_t::non_const_value_type;
+  using team_pol = Kokkos::TeamPolicy<execution_space>;
+  bool useRadix = !KokkosKernels::Impl::kk_is_gpu_exec_space<execution_space>();
+  lno_t numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0;
+  if (numRows == 0) return;
+  Impl::SortCrsMatrixFunctor<execution_space, rowmap_t, entries_t, values_t>
+      funct(useRadix, rowmap, entries, values);
+  if (useRadix) {
+    Kokkos::parallel_for("sort_crs_matrix",
+                         Kokkos::RangePolicy<execution_space>(0, numRows),
+                         funct);
+  } else {
+    // Try to get teamsize to be largest power of 2 not greater than avg entries
+    // per row
+    // TODO (probably important for performnce): add thread-level sort also, and
+    // use that for small avg degree. But this works for now.
+    lno_t idealTeamSize = 1;
+    lno_t avgDeg        = (entries.extent(0) + numRows - 1) / numRows;
+    while (idealTeamSize < avgDeg / 2) {
+      idealTeamSize *= 2;
+    }
+    team_pol temp(numRows, 1);
+    lno_t maxTeamSize = temp.team_size_max(funct, Kokkos::ParallelForTag());
+    lno_t teamSize    = std::min(idealTeamSize, maxTeamSize);
+    Kokkos::parallel_for("sort_crs_matrix", team_pol(numRows, teamSize), funct);
+  }
+}
+
+template <typename crsMat_t>
+void sort_crs_matrix(const crsMat_t& A) {
+  // Note: rowmap_t has const values, but that's OK as sorting doesn't modify it
+  using rowmap_t   = typename crsMat_t::row_map_type;
+  using entries_t  = typename crsMat_t::index_type::non_const_type;
+  using values_t   = typename crsMat_t::values_type::non_const_type;
+  using exec_space = typename crsMat_t::execution_space;
+  // NOTE: the rowmap of a StaticCrsGraph is const-valued, but the
+  // entries and CrsMatrix values are non-const (so sorting them directly
+  // is allowed)
+  sort_crs_matrix<exec_space, rowmap_t, entries_t, values_t>(
+      A.graph.row_map, A.graph.entries, A.values);
+}
+
+// Sort a BRS matrix: within each row, sort entries ascending by column and
+// permute the values accordingly.
+template <typename execution_space, typename rowmap_t, typename entries_t,
+          typename values_t, typename lno_t>
+void sort_bsr_matrix(const lno_t blockdim, const rowmap_t& rowmap,
+                     const entries_t& entries, const values_t& values) {
+  // TODO: this is O(N^2) mock for debugging - do regular implementation based
+  // on Radix/Bitonic sort (like CSR) IDEA: maybe we need only one general
+  // Radix2/Bitonic2 and CSR sorting may call it with blockSize=1 ?
+  lno_t numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0;
+  if (numRows == 0) return;
+  const lno_t blocksize = blockdim * blockdim;
+
+  assert(values.extent(0) == entries.extent(0) * blocksize);
+  Impl::sort_bsr_functor<rowmap_t, entries_t, values_t> bsr_sorter(
+      rowmap, entries, values, blocksize);
+  Kokkos::parallel_for("sort_bsr_matrix",
+                       Kokkos::RangePolicy<execution_space>(0, numRows),
+                       bsr_sorter);
+}
+
+// Sort a BSR matrix (like CRS but single values are replaced with contignous
+// blocks)
+template <typename bsrMat_t>
+void sort_bsr_matrix(const bsrMat_t& A) {
+  // NOTE: unlike rowmap, entries and values are non-const, so we can sort them
+  // directly
+  sort_bsr_matrix<typename bsrMat_t::execution_space,
+                  typename bsrMat_t::row_map_type,
+                  typename bsrMat_t::index_type::non_const_type,
+                  typename bsrMat_t::values_type::non_const_type>(
+      A.blockDim(), A.graph.row_map, A.graph.entries, A.values);
+}
+
+// Sort a CRS graph: within each row, sort entries ascending by column.
+template <typename execution_space, typename rowmap_t, typename entries_t>
+void sort_crs_graph(const rowmap_t& rowmap, const entries_t& entries) {
+  using lno_t    = typename entries_t::non_const_value_type;
+  using team_pol = Kokkos::TeamPolicy<execution_space>;
+  bool useRadix = !KokkosKernels::Impl::kk_is_gpu_exec_space<execution_space>();
+  lno_t numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0;
+  if (numRows == 0) return;
+  Impl::SortCrsGraphFunctor<execution_space, rowmap_t, entries_t> funct(
+      useRadix, rowmap, entries);
+  if (useRadix) {
+    Kokkos::parallel_for("sort_crs_graph",
+                         Kokkos::RangePolicy<execution_space>(0, numRows),
+                         funct);
+  } else {
+    // Try to get teamsize to be largest power of 2 less than or equal to
+    // half the entries per row. 0.5 * #entries is bitonic's parallelism within
+    // a row.
+    // TODO (probably important for performnce): add thread-level sort also, and
+    // use that for small avg degree. But this works for now.
+    lno_t idealTeamSize = 1;
+    lno_t avgDeg        = (entries.extent(0) + numRows - 1) / numRows;
+    while (idealTeamSize < avgDeg / 2) {
+      idealTeamSize *= 2;
+    }
+    team_pol temp(numRows, 1);
+    lno_t maxTeamSize = temp.team_size_max(funct, Kokkos::ParallelForTag());
+    lno_t teamSize    = std::min(idealTeamSize, maxTeamSize);
+    Kokkos::parallel_for("sort_crs_graph", team_pol(numRows, teamSize), funct);
+  }
+}
+
+template <typename crsGraph_t>
+void sort_crs_graph(const crsGraph_t& G) {
+  static_assert(
+      !std::is_const<typename crsGraph_t::entries_type::value_type>::value,
+      "sort_crs_graph requires StaticCrsGraph entries to be non-const.");
+  sort_crs_graph<typename crsGraph_t::execution_space,
+                 typename crsGraph_t::row_map_type,
+                 typename crsGraph_t::entries_type>(G.row_map, G.entries);
+}
+
+// Sort the rows of matrix, and merge duplicate entries.
+template <typename crsMat_t>
+crsMat_t sort_and_merge_matrix(const crsMat_t& A) {
+  using c_rowmap_t = typename crsMat_t::row_map_type;
+  using rowmap_t   = typename crsMat_t::row_map_type::non_const_type;
+  using entries_t  = typename crsMat_t::index_type::non_const_type;
+  using values_t   = typename crsMat_t::values_type::non_const_type;
+  using size_type  = typename rowmap_t::non_const_value_type;
+  using exec_space = typename crsMat_t::execution_space;
+  using range_t    = Kokkos::RangePolicy<exec_space>;
+  sort_crs_matrix(A);
+  // Count entries per row into a new rowmap, in terms of merges that can be
+  // done
+  rowmap_t mergedRowmap(
+      Kokkos::view_alloc(Kokkos::WithoutInitializing, "SortedMerged rowmap"),
+      A.numRows() + 1);
+  size_type numCompressedEntries = 0;
+  Kokkos::parallel_reduce(range_t(0, A.numRows()),
+                          Impl::MergedRowmapFunctor<rowmap_t, entries_t>(
+                              mergedRowmap, A.graph.row_map, A.graph.entries),
+                          numCompressedEntries);
+  // Prefix sum to get rowmap
+  KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum<rowmap_t, exec_space>(
+      A.numRows() + 1, mergedRowmap);
+  entries_t mergedEntries("SortedMerged entries", numCompressedEntries);
+  values_t mergedValues("SortedMerged values", numCompressedEntries);
+  // Compute merged entries and values
+  Kokkos::parallel_for(
+      range_t(0, A.numRows()),
+      Impl::MatrixMergedEntriesFunctor<c_rowmap_t, entries_t, values_t>(
+          A.graph.row_map, A.graph.entries, A.values, mergedRowmap,
+          mergedEntries, mergedValues));
+  // Finally, construct the new compressed matrix
+  return crsMat_t("SortedMerged", A.numRows(), A.numCols(),
+                  numCompressedEntries, mergedValues, mergedRowmap,
+                  mergedEntries);
+}
+
+template <typename exec_space, typename rowmap_t, typename entries_t>
+void sort_and_merge_graph(const typename rowmap_t::const_type& rowmap_in,
+                          const entries_t& entries_in, rowmap_t& rowmap_out,
+                          entries_t& entries_out) {
+  using size_type      = typename rowmap_t::non_const_value_type;
+  using lno_t          = typename entries_t::non_const_value_type;
+  using range_t        = Kokkos::RangePolicy<exec_space>;
+  using const_rowmap_t = typename rowmap_t::const_type;
+  lno_t numRows        = rowmap_in.extent(0);
+  if (numRows <= 1) {
+    // Matrix has zero rows
+    rowmap_out  = rowmap_t();
+    entries_out = entries_t();
+    return;
+  }
+  numRows--;
+  // Sort in place
+  sort_crs_graph<exec_space, const_rowmap_t, entries_t>(rowmap_in, entries_in);
+  // Count entries per row into a new rowmap, in terms of merges that can be
+  // done
+  rowmap_out = rowmap_t(
+      Kokkos::view_alloc(Kokkos::WithoutInitializing, "SortedMerged rowmap"),
+      numRows + 1);
+  size_type numCompressedEntries = 0;
+  Kokkos::parallel_reduce(range_t(0, numRows),
+                          Impl::MergedRowmapFunctor<rowmap_t, entries_t>(
+                              rowmap_out, rowmap_in, entries_in),
+                          numCompressedEntries);
+  // Prefix sum to get rowmap
+  KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum<rowmap_t, exec_space>(
+      numRows + 1, rowmap_out);
+  entries_out = entries_t("SortedMerged entries", numCompressedEntries);
+  // Compute merged entries and values
+  Kokkos::parallel_for(
+      range_t(0, numRows),
+      Impl::GraphMergedEntriesFunctor<const_rowmap_t, entries_t>(
+          rowmap_in, entries_in, rowmap_out, entries_out));
+}
+
+template <typename crsGraph_t>
+crsGraph_t sort_and_merge_graph(const crsGraph_t& G) {
+  using rowmap_t  = typename crsGraph_t::row_map_type::non_const_type;
+  using entries_t = typename crsGraph_t::entries_type;
+  static_assert(
+      !std::is_const<typename entries_t::value_type>::value,
+      "sort_and_merge_graph requires StaticCrsGraph entries to be non-const.");
+  rowmap_t mergedRowmap;
+  entries_t mergedEntries;
+  sort_and_merge_graph<typename crsGraph_t::execution_space, rowmap_t,
+                       entries_t>(G.row_map, G.entries, mergedRowmap,
+                                  mergedEntries);
+  return crsGraph_t(mergedEntries, mergedRowmap);
+}
+
+}  // namespace KokkosSparse
+
+namespace KokkosKernels {
+
+// ----------------------------------
+// BSR matrix/graph sorting utilities
+// ----------------------------------
+
+// Sort a BRS matrix: within each row, sort entries ascending by column and
+// permute the values accordingly.
+template <typename execution_space, typename rowmap_t, typename entries_t,
+          typename values_t,
+          typename lno_t = typename entries_t::non_const_value_type>
+[[deprecated]] void sort_bsr_matrix(const lno_t blockdim,
+                                    const rowmap_t& rowmap,
+                                    const entries_t& entries,
+                                    const values_t& values) {
+  KokkosSparse::sort_bsr_matrix(blockdim, rowmap, entries, values);
+}
+
+template <typename bsrMat_t>
+[[deprecated]] void sort_bsr_matrix(const bsrMat_t& A) {
+  KokkosSparse::sort_bsr_matrix(A);
+}
+
+// ----------------------------------
+// CRS matrix/graph sorting utilities
+// ----------------------------------
+
+// The sort_crs* functions sort the adjacent column list for each row into
+// ascending order.
+
+template <typename execution_space, typename rowmap_t, typename entries_t,
+          typename values_t>
+[[deprecated]] void sort_crs_matrix(const rowmap_t& rowmap,
+                                    const entries_t& entries,
+                                    const values_t& values) {
+  KokkosSparse::sort_crs_matrix<execution_space, rowmap_t, entries_t>(
+      rowmap, entries, values);
+}
+
+template <typename crsMat_t>
+[[deprecated]] void sort_crs_matrix(const crsMat_t& A) {
+  KokkosSparse::sort_crs_matrix(A);
+}
+
+template <typename execution_space, typename rowmap_t, typename entries_t>
+[[deprecated]] void sort_crs_graph(const rowmap_t& rowmap,
+                                   const entries_t& entries) {
+  KokkosSparse::sort_crs_graph<execution_space, rowmap_t, entries_t>(rowmap,
+                                                                     entries);
+}
+
+template <typename crsGraph_t>
+[[deprecated]] void sort_crs_graph(const crsGraph_t& G) {
+  KokkosSparse::sort_crs_graph(G);
+}
+
+// sort_and_merge_matrix produces a new matrix which is equivalent to A but is
+// sorted and has no duplicated entries: each (i, j) is unique. Values for
+// duplicated entries are summed.
+template <typename crsMat_t>
+[[deprecated]] crsMat_t sort_and_merge_matrix(const crsMat_t& A) {
+  KokkosSparse::sort_and_merge_matrix(A);
+}
+
+template <typename crsGraph_t>
+[[deprecated]] crsGraph_t sort_and_merge_graph(const crsGraph_t& G) {
+  KokkosSparse::sort_and_merge_graph(G);
+}
+
+template <typename exec_space, typename rowmap_t, typename entries_t>
+[[deprecated]] void sort_and_merge_graph(
+    const typename rowmap_t::const_type& rowmap_in, const entries_t& entries_in,
+    rowmap_t& rowmap_out, entries_t& entries_out) {
+  KokkosSparse::sort_and_merge_graph(rowmap_in, entries_in, rowmap_out,
+                                     entries_out);
+}
+
+// For backward compatibility: keep the public interface accessible in
+// KokkosKernels::Impl::
+namespace Impl {
+template <typename execution_space, typename rowmap_t, typename entries_t>
+[[deprecated]] void sort_crs_graph(const rowmap_t& rowmap,
+                                   const entries_t& entries) {
+  KokkosKernels::sort_crs_graph<execution_space, rowmap_t, entries_t>(rowmap,
+                                                                      entries);
+}
+
+template <typename execution_space, typename rowmap_t, typename entries_t,
+          typename values_t>
+[[deprecated]] void sort_crs_matrix(const rowmap_t& rowmap,
+                                    const entries_t& entries,
+                                    const values_t& values) {
+  KokkosKernels::sort_crs_matrix<execution_space, rowmap_t, entries_t,
+                                 values_t>(rowmap, entries, values);
+}
+
+template <typename crsMat_t>
+[[deprecated]] void sort_crs_matrix(const crsMat_t& A) {
+  KokkosKernels::sort_crs_matrix(A);
+}
+
+template <typename exec_space, typename rowmap_t, typename entries_t>
+[[deprecated]] void sort_and_merge_graph(
+    const typename rowmap_t::const_type& rowmap_in, const entries_t& entries_in,
+    rowmap_t& rowmap_out, entries_t& entries_out) {
+  KokkosKernels::sort_and_merge_graph<exec_space, rowmap_t, entries_t>(
+      rowmap_in, entries_in, rowmap_out, entries_out);
+}
+
+template <typename crsMat_t>
+[[deprecated]] crsMat_t sort_and_merge_matrix(const crsMat_t& A) {
+  return KokkosKernels::sort_and_merge_matrix(A);
+}
+
+}  // namespace Impl
+}  // namespace KokkosKernels
+
+#endif  // _KOKKOSSPARSE_SORTCRS_HPP
diff --git a/src/common/KokkosKernels_SparseUtils.hpp b/src/sparse/KokkosSparse_Utils.hpp
similarity index 90%
rename from src/common/KokkosKernels_SparseUtils.hpp
rename to src/sparse/KokkosSparse_Utils.hpp
index 323ae7846f..007b2aea85 100644
--- a/src/common/KokkosKernels_SparseUtils.hpp
+++ b/src/sparse/KokkosSparse_Utils.hpp
@@ -57,7 +57,7 @@
 #include <parallel/algorithm>
 #endif
 
-namespace KokkosKernels {
+namespace KokkosSparse {
 
 enum SparseMatrixFormat {
   BlockCRS,
@@ -72,7 +72,7 @@ namespace Impl {
 template <typename in_row_view_t, typename in_nnz_view_t,
           typename in_val_view_t, typename out_row_view_t,
           typename out_nnz_view_t, typename out_val_view_t>
-void kk_create_blockcrs_formated_point_crsmatrix(
+void kk_create_blockcrs_formatted_point_crsmatrix(
     int block_size, size_t num_rows, size_t num_cols, in_row_view_t in_xadj,
     in_nnz_view_t in_adj, in_val_view_t in_vals,
 
@@ -293,17 +293,17 @@ struct TransposeMatrix {
   struct CountTag {};
   struct FillTag {};
 
-  typedef Kokkos::TeamPolicy<CountTag, MyExecSpace> team_count_policy_t;
-  typedef Kokkos::TeamPolicy<FillTag, MyExecSpace> team_fill_policy_t;
+  using team_count_policy_t = Kokkos::TeamPolicy<CountTag, MyExecSpace>;
+  using team_fill_policy_t  = Kokkos::TeamPolicy<FillTag, MyExecSpace>;
 
-  typedef typename team_count_policy_t::member_type team_count_member_t;
-  typedef typename team_fill_policy_t::member_type team_fill_member_t;
+  using team_count_member_t = typename team_count_policy_t::member_type;
+  using team_fill_member_t  = typename team_fill_policy_t::member_type;
 
-  typedef typename in_nnz_view_t::non_const_value_type nnz_lno_t;
-  typedef typename in_row_view_t::non_const_value_type size_type;
+  using nnz_lno_t = typename in_nnz_view_t::non_const_value_type;
+  using size_type = typename in_row_view_t::non_const_value_type;
 
-  typename in_nnz_view_t::non_const_value_type num_rows;
-  typename in_nnz_view_t::non_const_value_type num_cols;
+  nnz_lno_t num_rows;
+  nnz_lno_t num_cols;
   in_row_view_t xadj;
   in_nnz_view_t adj;
   in_scalar_view_t vals;
@@ -425,11 +425,12 @@ void transpose_matrix(
 
   // determine vector lanes per thread
   int thread_size = kk_get_suggested_vector_size(
-      num_rows, nnz, kk_get_exec_space_type<MyExecSpace>());
+      num_rows, nnz,
+      KokkosKernels::Impl::kk_get_exec_space_type<MyExecSpace>());
 
   // determine threads per team
   int team_size = kk_get_suggested_team_size(
-      thread_size, kk_get_exec_space_type<MyExecSpace>());
+      thread_size, KokkosKernels::Impl::kk_get_exec_space_type<MyExecSpace>());
 
   TransposeFunctor_t tm(num_rows, num_cols, xadj, adj, vals, t_xadj, t_adj,
                         t_vals, tmp_row_view, true, team_size);
@@ -439,8 +440,9 @@ void transpose_matrix(
                                   team_size, thread_size),
                        tm);
 
-  kk_exclusive_parallel_prefix_sum<out_row_view_t, MyExecSpace>(num_cols + 1,
-                                                                t_xadj);
+  KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum<out_row_view_t,
+                                                        MyExecSpace>(
+      num_cols + 1, t_xadj);
 
   Kokkos::deep_copy(tmp_row_view, t_xadj);
 
@@ -508,11 +510,12 @@ void transpose_graph(
 
   // determine vector lanes per thread
   int thread_size = kk_get_suggested_vector_size(
-      num_rows, nnz, kk_get_exec_space_type<MyExecSpace>());
+      num_rows, nnz,
+      KokkosKernels::Impl::kk_get_exec_space_type<MyExecSpace>());
 
   // determine threads per team
   int team_size = kk_get_suggested_team_size(
-      thread_size, kk_get_exec_space_type<MyExecSpace>());
+      thread_size, KokkosKernels::Impl::kk_get_exec_space_type<MyExecSpace>());
 
   TransposeFunctor_t tm(num_rows, num_cols, xadj, adj, tmp1, t_xadj, t_adj,
                         tmp2, tmp_row_view, false, team_size);
@@ -522,8 +525,9 @@ void transpose_graph(
                                   team_size, thread_size),
                        tm);
 
-  kk_exclusive_parallel_prefix_sum<out_row_view_t, MyExecSpace>(num_cols + 1,
-                                                                t_xadj);
+  KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum<out_row_view_t,
+                                                        MyExecSpace>(
+      num_cols + 1, t_xadj);
 
   Kokkos::deep_copy(tmp_row_view, t_xadj);
 
@@ -535,6 +539,116 @@ void transpose_graph(
   MyExecSpace().fence();
 }
 
+template <typename in_row_view_t, typename in_nnz_view_t,
+          typename in_scalar_view_t, typename out_row_view_t,
+          typename out_nnz_view_t, typename out_scalar_view_t>
+struct TransposeBsrMatrix {
+  using ordinal_type = typename in_nnz_view_t::non_const_value_type;
+  using size_type    = typename in_row_view_t::non_const_value_type;
+
+  int block_size;
+  in_row_view_t Arow_map;
+  in_nnz_view_t Aentries;
+  in_scalar_view_t Avalues;
+  out_row_view_t tArow_map;    // allocated
+  out_nnz_view_t tAentries;    // allocated
+  out_scalar_view_t tAvalues;  // allocated
+
+  TransposeBsrMatrix(const int blockSize, in_row_view_t row_mapA,
+                     in_nnz_view_t entriesA, in_scalar_view_t valuesA,
+                     out_row_view_t row_mapAt, out_nnz_view_t entriesAt,
+                     out_scalar_view_t valuesAt)
+      : block_size(blockSize),
+        Arow_map(row_mapA),
+        Aentries(entriesA),
+        Avalues(valuesA),
+        tArow_map(row_mapAt),
+        tAentries(entriesAt),
+        tAvalues(valuesAt){};
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int tArowIdx) const {
+    // Loop over entries in row
+    for (size_type tAentryIdx = tArow_map(tArowIdx);
+         tAentryIdx < tArow_map(tArowIdx + 1); ++tAentryIdx) {
+      ordinal_type tAcolIdx = tAentries(tAentryIdx);
+
+      // we have block tA(tArowIdx, tAcolIdx) starting at tAvalues(entryIdx)
+      // we need to find AentryIdx corresponding to A(tAcolIdx, tArowIdx)
+      size_type AentryIdx;
+      for (AentryIdx = Arow_map(tAcolIdx); AentryIdx < Arow_map(tAcolIdx + 1);
+           ++AentryIdx) {
+        if (tArowIdx == Aentries(AentryIdx)) break;
+      }
+
+      // we loop over block_size*block_size Avalues starting at AentryIdx
+      // and store them into tAvalues in transpose order starting at tAentryIdx
+      for (int i = 0; i < block_size; ++i) {
+        for (int j = 0; j < block_size; ++j) {
+          tAvalues(tAentryIdx * block_size * block_size + i * block_size + j) =
+              Avalues(AentryIdx * block_size * block_size + j * block_size + i);
+        }
+      }
+    }
+  }
+};  // TransposeBsrMatrix
+
+template <typename in_row_view_t, typename in_nnz_view_t,
+          typename in_scalar_view_t, typename out_row_view_t,
+          typename out_nnz_view_t, typename out_scalar_view_t,
+          typename MyExecSpace>
+void transpose_bsr_matrix(
+    typename in_nnz_view_t::non_const_value_type num_rows,
+    typename in_nnz_view_t::non_const_value_type num_cols, const int block_size,
+    in_row_view_t xadj, in_nnz_view_t adj, in_scalar_view_t vals,
+    out_row_view_t t_xadj,    // pre-allocated -- initialized with 0
+    out_nnz_view_t t_adj,     // pre-allocated -- no need for initialize
+    out_scalar_view_t t_vals  // pre-allocated -- no need for initialize
+) {
+  using TransposeBsrFunctor_type =
+      TransposeBsrMatrix<in_row_view_t, in_nnz_view_t, in_scalar_view_t,
+                         out_row_view_t, out_nnz_view_t, out_scalar_view_t>;
+
+  // Step 1: call transpose_graph of bsr matrix
+  transpose_graph<in_row_view_t, in_nnz_view_t, out_row_view_t, out_nnz_view_t,
+                  out_row_view_t, MyExecSpace>(num_rows, num_cols, xadj, adj,
+                                               t_xadj, t_adj);
+
+  // Step 2: transpose the values of A
+  Kokkos::RangePolicy<MyExecSpace> my_policy(0, num_cols);
+  TransposeBsrFunctor_type my_functor(block_size, xadj, adj, vals, t_xadj,
+                                      t_adj, t_vals);
+
+  Kokkos::parallel_for(my_policy, my_functor);
+  MyExecSpace().fence();
+}
+
+template <typename bsrMat_t>
+bsrMat_t transpose_bsr_matrix(const bsrMat_t &A) {
+  // Allocate views and call the other version of transpose_matrix
+  using c_rowmap_t  = typename bsrMat_t::row_map_type;
+  using c_entries_t = typename bsrMat_t::index_type;
+  using c_values_t  = typename bsrMat_t::values_type;
+  using rowmap_t    = typename bsrMat_t::row_map_type::non_const_type;
+  using entries_t   = typename bsrMat_t::index_type::non_const_type;
+  using values_t    = typename bsrMat_t::values_type::non_const_type;
+
+  rowmap_t AT_rowmap("Transpose rowmap", A.numCols() + 1);
+  entries_t AT_entries(
+      Kokkos::view_alloc(Kokkos::WithoutInitializing, "Transpose entries"),
+      A.nnz());
+  values_t AT_values(
+      Kokkos::view_alloc(Kokkos::WithoutInitializing, "Transpose values"),
+      A.nnz() * A.blockDim() * A.blockDim());
+  transpose_bsr_matrix<c_rowmap_t, c_entries_t, c_values_t, rowmap_t, entries_t,
+                       values_t, typename bsrMat_t::execution_space>(
+      A.numRows(), A.numCols(), A.blockDim(), A.graph.row_map, A.graph.entries,
+      A.values, AT_rowmap, AT_entries, AT_values);
+  // And construct the transpose crsMat_t
+  return bsrMat_t("Transpose", A.numCols(), A.numRows(), A.nnz(), AT_values,
+                  AT_rowmap, AT_entries, A.blockDim());
+}
+
 template <typename forward_map_type, typename reverse_map_type>
 struct Fill_Reverse_Scale_Functor {
   struct CountTag {};
@@ -715,7 +829,8 @@ void kk_create_reverse_map(
 
     // kk_inclusive_parallel_prefix_sum<reverse_array_type,
     // MyExecSpace>(tmp_reverse_size + 1, tmp_color_xadj);
-    kk_exclusive_parallel_prefix_sum<reverse_array_type, MyExecSpace>(
+    KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum<reverse_array_type,
+                                                          MyExecSpace>(
         tmp_reverse_size + 1, tmp_color_xadj);
     MyExecSpace().fence();
 
@@ -750,7 +865,8 @@ void kk_create_reverse_map(
 
     // kk_inclusive_parallel_prefix_sum<reverse_array_type,
     // MyExecSpace>(num_reverse_elements + 1, reverse_map_xadj);
-    kk_exclusive_parallel_prefix_sum<reverse_array_type, MyExecSpace>(
+    KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum<reverse_array_type,
+                                                          MyExecSpace>(
         num_reverse_elements + 1, tmp_color_xadj);
     MyExecSpace().fence();
 
@@ -843,7 +959,8 @@ inline size_t kk_is_d1_coloring_valid(
     typename in_nnz_view_t::non_const_value_type num_rows,
     typename in_nnz_view_t::non_const_value_type /*num_cols*/,
     in_row_view_t xadj, in_nnz_view_t adj, in_color_view_t v_colors) {
-  ExecSpaceType my_exec_space = kk_get_exec_space_type<MyExecSpace>();
+  KokkosKernels::Impl::ExecSpaceType my_exec_space =
+      KokkosKernels::Impl::kk_get_exec_space_type<MyExecSpace>();
   int vector_size =
       kk_get_suggested_vector_size(num_rows, adj.extent(0), my_exec_space);
   int suggested_team_size =
@@ -926,160 +1043,6 @@ void graph_min_max_degree(const rowmap_t &rowmap, ordinal_t &min_degree,
   max_degree = result.max_val;
 }
 
-/*
-template <typename in_row_view_t,
-          typename in_nnz_view_t,
-          typename out_nnz_view_t,
-          typename MyExecSpace>
-struct IncidenceMatrix{
-
-  struct FillTag{};
-
-  typedef struct FillTag FillTag;
-
-  typedef Kokkos::TeamPolicy<FillTag, MyExecSpace> team_fill_policy_t ;
-  typedef Kokkos::TeamPolicy<FillTag, MyExecSpace,
-Kokkos::Schedule<Kokkos::Dynamic> > dynamic_team_fill_policy_t ; typedef
-typename team_fill_policy_t::member_type team_fill_member_t ;
-
-  typedef typename in_nnz_view_t::non_const_value_type nnz_lno_t;
-  typedef typename in_row_view_t::non_const_value_type size_type;
-
-
-  typename in_nnz_view_t::non_const_value_type num_rows;
-  in_row_view_t xadj;
-  in_nnz_view_t adj;
-  out_nnz_view_t t_adj;  //allocated
-  typename in_row_view_t::non_const_type tmp_txadj;
-  nnz_lno_t team_work_size;
-
-  IncidenceMatrix(
-      nnz_lno_t num_rows_,
-      in_row_view_t xadj_,
-      in_nnz_view_t adj_,
-      out_nnz_view_t t_adj_,
-      typename in_row_view_t::non_const_type tmp_txadj_,
-      nnz_lno_t team_row_work_size_):
-        num_rows(num_rows_),
-        xadj(xadj_), adj(adj_),
-        t_adj(t_adj_),
-        tmp_txadj(tmp_txadj_), team_work_size(team_row_work_size_) {}
-
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()(const FillTag&, const team_fill_member_t & teamMember) const {
-    const nnz_lno_t team_row_begin = teamMember.league_rank() * team_work_size;
-    const nnz_lno_t team_row_end = KOKKOSKERNELS_MACRO_MIN(team_row_begin +
-team_work_size, num_rows);
-
-
-    Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,team_row_begin,team_row_end),
-[&] (const nnz_lno_t& row_index) { const size_type col_begin = xadj[row_index];
-      const size_type col_end = xadj[row_index + 1];
-      const nnz_lno_t left_work = col_end - col_begin;
-      Kokkos::parallel_for(
-          Kokkos::ThreadVectorRange(teamMember, left_work),
-          [&] (nnz_lno_t i) {
-        const size_type adjind = i + col_begin;
-        const nnz_lno_t colIndex = adj[adjind];
-        if (row_index < colIndex){
-
-          const size_type pos =
-Kokkos::atomic_fetch_add(&(tmp_txadj(colIndex)),1); t_adj(adjind) = adjind;
-          t_adj(pos) = adjind;
-        }
-      });
-    //}
-    });
-  }
-};
-*/
-/**
- * \brief function returns transpose of the given graph.
- * \param num_rows: num rows in input graph
- * \param num_cols: num cols in input graph
- * \param xadj: row pointers of the input graph
- * \param adj: column indices of the input graph
- * \param t_xadj: output, the row indices of the output graph. MUST BE
- * INITIALIZED WITH ZEROES. \param t_adj: output, column indices. No need for
- * initializations. \param vector_size: suggested vector size, optional. if -1,
- * kernel will decide. \param suggested_team_size: suggested team size,
- * optional. if -1, kernel will decide. \param team_work_chunk_size: suggested
- * work size of a team, optional. if -1, kernel will decide. \param
- * use_dynamic_scheduling: whether to use dynamic scheduling. Default is true.
- */
-/*
-template <typename in_row_view_t,
-          typename in_nnz_view_t,
-          typename out_nnz_view_t,
-          typename MyExecSpace>
-inline void kk_create_incidence_matrix(
-    typename in_nnz_view_t::non_const_value_type num_rows,
-    in_row_view_t xadj,
-    in_nnz_view_t adj,
-    out_nnz_view_t i_adj,  //pre-allocated -- no need for initialize -- size is
-same as adj int vector_size = -1, int suggested_team_size = -1, typename
-in_nnz_view_t::non_const_value_type team_work_chunk_size = -1, bool
-use_dynamic_scheduling = true
-    ){
-
-
-  typedef typename in_row_view_t::non_const_type tmp_row_view_t;
-  //allocate some memory for work for row pointers
-  tmp_row_view_t tmp_row_view(Kokkos::view_alloc(Kokkos::WithoutInitializing,
-"tmp_row_view"), num_rows + 1);
-
-  Kokkos::deep_copy(tmp_row_view, xadj);
-
-  in_nnz_view_t tmp1;
-  out_nnz_view_t tmp2;
-
-  //create the functor for tranpose.
-  typedef IncidenceMatrix <
-      in_row_view_t, in_nnz_view_t, in_nnz_view_t,
-      out_nnz_view_t, MyExecSpace>  IncidenceMatrix_Functor_t;
-
-  IncidenceMatrix_Functor_t tm ( num_rows, xadj, adj,
-                                t_adj, tmp_row_view,
-                                false,
-                                team_work_chunk_size);
-
-
-  typedef typename IncidenceMatrix_Functor_t::team_fill_policy_t fill_tp_t;
-  typedef typename IncidenceMatrix_Functor_t::dynamic_team_fill_policy_t
-d_fill_tp_t;
-
-  typename in_row_view_t::non_const_value_type nnz = adj.extent(0);
-
-  //set the vector size, if not suggested.
-  if (vector_size == -1)
-    vector_size = kk_get_suggested_vector_size(num_rows, nnz,
-kk_get_exec_space_type<MyExecSpace>());
-
-  //set the team size, if not suggested.
-  if (suggested_team_size == -1)
-    suggested_team_size = kk_get_suggested_team_size(vector_size,
-kk_get_exec_space_type<MyExecSpace>());
-
-  //set the chunk size, if not suggested.
-  if (team_work_chunk_size == -1)
-    team_work_chunk_size = suggested_team_size;
-
-
-
-  if (use_dynamic_scheduling){
-    Kokkos::parallel_for(  fill_tp_t(num_rows  / team_work_chunk_size + 1 ,
-suggested_team_size, vector_size), tm);
-  }
-  else {
-    Kokkos::parallel_for(  d_fill_tp_t(num_rows  / team_work_chunk_size + 1 ,
-suggested_team_size, vector_size), tm);
-  }
-  MyExecSpace().fence();
-
-}
-*/
-
 template <typename size_type, typename lno_t>
 void kk_get_lower_triangle_count_sequential(const lno_t nv,
                                             const size_type *in_xadj,
@@ -1140,7 +1103,7 @@ struct LowerTriangularMatrix {
   scalar_t *t_vals;
 
   const lno_t team_work_size;
-  const ExecSpaceType exec_space;
+  const KokkosKernels::Impl::ExecSpaceType exec_space;
   const bool is_lower;
 
   LowerTriangularMatrix(const lno_t num_rows_, const size_type *xadj_,
@@ -1157,7 +1120,8 @@ struct LowerTriangularMatrix {
         t_adj(t_adj_),
         t_vals(out_vals_),
         team_work_size(team_row_work_size_),
-        exec_space(kk_get_exec_space_type<ExecutionSpace>()),
+        exec_space(
+            KokkosKernels::Impl::kk_get_exec_space_type<ExecutionSpace>()),
         is_lower(is_lower_) {}
 
   KOKKOS_INLINE_FUNCTION
@@ -1274,9 +1238,10 @@ void kk_get_lower_triangle_count_parallel(
     bool use_dynamic_scheduling = false, int chunksize = 4,
     bool is_lower = true) {
   const int vector_size = kk_get_suggested_vector_size(
-      nv, ne, kk_get_exec_space_type<ExecutionSpace>());
+      nv, ne, KokkosKernels::Impl::kk_get_exec_space_type<ExecutionSpace>());
   const int suggested_team_size = kk_get_suggested_team_size(
-      vector_size, kk_get_exec_space_type<ExecutionSpace>());
+      vector_size,
+      KokkosKernels::Impl::kk_get_exec_space_type<ExecutionSpace>());
   const int team_work_chunk_size = suggested_team_size * chunksize;
   typedef LowerTriangularMatrix<size_type, lno_t, ExecutionSpace> ltm_t;
 
@@ -1439,9 +1404,10 @@ void kk_get_lower_triangle_fill_parallel(
     bool use_dynamic_scheduling = false, bool chunksize = 4,
     bool is_lower = true) {
   const int vector_size = kk_get_suggested_vector_size(
-      nv, ne, kk_get_exec_space_type<ExecutionSpace>());
+      nv, ne, KokkosKernels::Impl::kk_get_exec_space_type<ExecutionSpace>());
   const int suggested_team_size = kk_get_suggested_team_size(
-      vector_size, kk_get_exec_space_type<ExecutionSpace>());
+      vector_size,
+      KokkosKernels::Impl::kk_get_exec_space_type<ExecutionSpace>());
   const int team_work_chunk_size = suggested_team_size * chunksize;
 
   typedef LowerTriangularMatrix<size_type, lno_t, ExecutionSpace, scalar_t>
@@ -1573,8 +1539,9 @@ crstmat_t kk_get_lower_triangle(
       nr, ne, rowmap, entries, new_row_map.data(), new_indices,
       use_dynamic_scheduling, chunksize);
 
-  kk_exclusive_parallel_prefix_sum<row_map_view_t, exec_space>(nr + 1,
-                                                               new_row_map);
+  KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum<row_map_view_t,
+                                                        exec_space>(
+      nr + 1, new_row_map);
   exec_space().fence();
 
   auto ll_size   = Kokkos::subview(new_row_map, nr);
@@ -1630,8 +1597,9 @@ crstmat_t kk_get_lower_crs_matrix(
       nr, ne, rowmap, entries, new_row_map.data(), new_indices,
       use_dynamic_scheduling, chunksize);
 
-  kk_exclusive_parallel_prefix_sum<row_map_view_t, exec_space>(nr + 1,
-                                                               new_row_map);
+  KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum<row_map_view_t,
+                                                        exec_space>(
+      nr + 1, new_row_map);
   exec_space().fence();
 
   auto ll_size   = Kokkos::subview(new_row_map, nr);
@@ -1683,8 +1651,9 @@ graph_t kk_get_lower_crs_graph(graph_t in_crs_matrix,
   kk_get_lower_triangle_count<size_type, lno_t, exec_space>(
       nr, ne, rowmap, entries, new_row_map.data(), new_indices);
 
-  kk_exclusive_parallel_prefix_sum<row_map_view_t, exec_space>(nr + 1,
-                                                               new_row_map);
+  KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum<row_map_view_t,
+                                                        exec_space>(
+      nr + 1, new_row_map);
   exec_space().fence();
 
   auto ll_size   = Kokkos::subview(new_row_map, nr);
@@ -1736,8 +1705,9 @@ void kk_get_lower_triangle(typename cols_view_t::non_const_value_type nr,
       nr, ne, rowmap, entries, out_rowmap.data(), new_indices.data(),
       use_dynamic_scheduling, chunksize, is_lower);
 
-  kk_exclusive_parallel_prefix_sum<out_row_map_view_t, exec_space>(nr + 1,
-                                                                   out_rowmap);
+  KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum<out_row_map_view_t,
+                                                        exec_space>(nr + 1,
+                                                                    out_rowmap);
   exec_space().fence();
 
   auto ll_size   = Kokkos::subview(out_rowmap, nr);
@@ -1844,8 +1814,9 @@ void kk_create_incidence_matrix_from_original_matrix(
       permutation.data(), use_dynamic_scheduling, chunksize,
       sort_decreasing_order);
   exec_space().fence();
-  kk_exclusive_parallel_prefix_sum<out_row_map_view_t, exec_space>(nr + 1,
-                                                                   out_rowmap);
+  KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum<out_row_map_view_t,
+                                                        exec_space>(nr + 1,
+                                                                    out_rowmap);
 
   // kk_print_1Dview(out_rowmap, false, 20);
 
@@ -2069,21 +2040,21 @@ template <typename scalar_t, typename lno_t, typename device,
           typename mem_traits, typename size_type>
 struct MatrixTraits<
     KokkosSparse::CrsMatrix<scalar_t, lno_t, device, mem_traits, size_type>> {
-  static constexpr auto format = KokkosKernels::CRS;
+  static constexpr auto format = KokkosSparse::CRS;
 };
 
 template <typename scalar_t, typename lno_t, typename device,
           typename mem_traits, typename size_type>
 struct MatrixTraits<KokkosSparse::Experimental::BlockCrsMatrix<
     scalar_t, lno_t, device, mem_traits, size_type>> {
-  static constexpr auto format = KokkosKernels::BlockCRS;
+  static constexpr auto format = KokkosSparse::BlockCRS;
 };
 
 template <typename scalar_t, typename lno_t, typename device,
           typename mem_traits, typename size_type>
 struct MatrixTraits<KokkosSparse::Experimental::BsrMatrix<
     scalar_t, lno_t, device, mem_traits, size_type>> {
-  static constexpr auto format = KokkosKernels::BSR;
+  static constexpr auto format = KokkosSparse::BSR;
 };
 
 template <SparseMatrixFormat /* outFormat */>
@@ -2097,7 +2068,7 @@ struct MatrixConverter<BlockCRS> {
           KokkosSparse::CrsMatrix<scalar_t, lno_t, device, void, size_type>,
       typename blockCrsMat_t = KokkosSparse::Experimental::BlockCrsMatrix<
           scalar_t, lno_t, device, void, size_type>>
-  static blockCrsMat_t from_blockcrs_formated_point_crsmatrix(
+  static blockCrsMat_t from_blockcrs_formatted_point_crsmatrix(
       const KokkosSparse::CrsMatrix<scalar_t, lno_t, device, void, size_type>
           &mtx,
       lno_t block_size) {
@@ -2111,7 +2082,7 @@ struct MatrixConverter<BSR> {
             typename device,
             typename bsrMtx_t = KokkosSparse::Experimental::BsrMatrix<
                 scalar_t, lno_t, device, void, size_type>>
-  static bsrMtx_t from_blockcrs_formated_point_crsmatrix(
+  static bsrMtx_t from_blockcrs_formatted_point_crsmatrix(
       const KokkosSparse::CrsMatrix<scalar_t, lno_t, device, void, size_type>
           &mtx,
       lno_t block_size) {
@@ -2120,6 +2091,17 @@ struct MatrixConverter<BSR> {
 };
 
 }  // namespace Impl
+}  // namespace KokkosSparse
+
+namespace KokkosKernels {
+
+enum [[deprecated]] SparseMatrixFormat{
+    BlockCRS, BSR,
+    CRS = BlockCRS,  // convenience alias: for block_size=1 or no-blocks there
+                     // is no difference in value ordering (so the format tag
+                     // becomes irrelevant)
+};
+
 }  // namespace KokkosKernels
 
 #endif
diff --git a/src/common/KokkosKernels_SparseUtils_cusparse.hpp b/src/sparse/KokkosSparse_Utils_cusparse.hpp
similarity index 64%
rename from src/common/KokkosKernels_SparseUtils_cusparse.hpp
rename to src/sparse/KokkosSparse_Utils_cusparse.hpp
index ea9bfd37dd..6e9eee5ab5 100644
--- a/src/common/KokkosKernels_SparseUtils_cusparse.hpp
+++ b/src/sparse/KokkosSparse_Utils_cusparse.hpp
@@ -114,6 +114,83 @@ inline void cusparse_internal_safe_call(cusparseStatus_t cusparseStatus,
   KokkosSparse::Impl::cusparse_internal_safe_call(call, #call, __FILE__, \
                                                   __LINE__)
 
+template <typename T>
+cudaDataType cuda_data_type_from() {
+  // compile-time failure with a nice message if called on an unsupported type
+  static_assert(!std::is_same<T, T>::value,
+                "cuSparse TPL does not support scalar type");
+  // static_assert(false, ...) is allowed to error even if the code is not
+  // instantiated. obfuscate the predicate Despite this function being
+  // uncompilable, the compiler may decide that a return statement is missing,
+  // so throw to silence that
+  throw std::logic_error("unreachable throw after static_assert");
+}
+
+/* If half_t is not float, need to define a conversion for both
+   otherwise, conversion for half_t IS conversion for float
+*/
+#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT
+template <>
+inline cudaDataType cuda_data_type_from<Kokkos::Experimental::half_t>() {
+  return CUDA_R_16F;  // Kokkos half_t is a half
+}
+#endif
+// half_t is defined to be float, so this works for both half_t and float when
+// half_t is float
+template <>
+inline cudaDataType cuda_data_type_from<float>() {
+  return CUDA_R_32F;  // Kokkos half_t is a float
+}
+template <>
+inline cudaDataType cuda_data_type_from<double>() {
+  return CUDA_R_64F;
+}
+template <>
+inline cudaDataType cuda_data_type_from<Kokkos::complex<float>>() {
+  return CUDA_C_32F;
+}
+template <>
+inline cudaDataType cuda_data_type_from<Kokkos::complex<double>>() {
+  return CUDA_C_32F;
+}
+
+#if defined(CUSPARSE_VERSION) && (10300 <= CUSPARSE_VERSION)
+
+template <typename T>
+cusparseIndexType_t cusparse_index_type_t_from() {
+#define AS_STR_LITERAL_IMPL_(x) #x
+#define AS_STR_LITERAL(x) AS_STR_LITERAL_IMPL_(x)
+  static_assert(!std::is_same<T, T>::value,
+                "cuSparse " AS_STR_LITERAL(
+                    CUSPARSE_VERSION) " TPL does not support index type");
+  // static_assert(false, ...) is allowed to error even if the code is not
+  // instantiated. obfuscate the predicate Despite this function being
+  // uncompilable, the compiler may decide that a return statement is missing,
+  // so throw to silence that
+  throw std::logic_error("unreachable throw after static_assert");
+#undef AS_STR_LITERAL_IMPL_
+#undef AS_STR_LITERAL
+}
+
+template <>
+inline cusparseIndexType_t cusparse_index_type_t_from<int>() {
+  return CUSPARSE_INDEX_32I;
+}
+template <>
+inline cusparseIndexType_t cusparse_index_type_t_from<int64_t>() {
+  return CUSPARSE_INDEX_64I;
+}
+// Currently no CUSPARSE_INDEX_64U but this will work most of the time
+template <>
+inline cusparseIndexType_t cusparse_index_type_t_from<size_t>() {
+  return CUSPARSE_INDEX_64I;
+}
+template <>
+inline cusparseIndexType_t cusparse_index_type_t_from<unsigned short>() {
+  return CUSPARSE_INDEX_16U;
+}
+#endif
+
 }  // namespace Impl
 
 }  // namespace KokkosSparse
diff --git a/src/sparse/KokkosSparse_Utils_mkl.hpp b/src/sparse/KokkosSparse_Utils_mkl.hpp
new file mode 100644
index 0000000000..b9eb3a9bd2
--- /dev/null
+++ b/src/sparse/KokkosSparse_Utils_mkl.hpp
@@ -0,0 +1,259 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef _KOKKOSKERNELS_SPARSEUTILS_MKL_HPP
+#define _KOKKOSKERNELS_SPARSEUTILS_MKL_HPP
+
+#include "KokkosKernels_config.h"
+
+#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL
+
+#include <mkl.h>
+
+namespace KokkosSparse {
+namespace Impl {
+
+inline void mkl_internal_safe_call(sparse_status_t mkl_status, const char *name,
+                                   const char *file = nullptr,
+                                   const int line   = 0) {
+  if (SPARSE_STATUS_SUCCESS != mkl_status) {
+    std::ostringstream oss;
+    oss << "MKL call \"" << name << "\" at " << file << ":" << line
+        << " encountered error: ";
+    switch (mkl_status) {
+      case SPARSE_STATUS_NOT_INITIALIZED:
+        oss << "SPARSE_STATUS_NOT_INITIALIZED (empty handle or matrix arrays)";
+        break;
+      case SPARSE_STATUS_ALLOC_FAILED:
+        oss << "SPARSE_STATUS_ALLOC_FAILED (internal error: memory allocation "
+               "failed)";
+        break;
+      case SPARSE_STATUS_INVALID_VALUE:
+        oss << "SPARSE_STATUS_INVALID_VALUE (invalid input value)";
+        break;
+      case SPARSE_STATUS_EXECUTION_FAILED:
+        oss << "SPARSE_STATUS_EXECUTION_FAILED (e.g. 0-diagonal element for "
+               "triangular solver)";
+        break;
+      case SPARSE_STATUS_INTERNAL_ERROR:
+        oss << "SPARSE_STATUS_INTERNAL_ERROR";
+        break;
+      case SPARSE_STATUS_NOT_SUPPORTED:
+        oss << "SPARSE_STATUS_NOT_SUPPORTED (e.g. operation for double "
+               "precision doesn't support other types)";
+        break;
+      default: oss << "unknown (code " << (int)mkl_status << ")"; break;
+    }
+    oss << '\n';
+    Kokkos::abort(oss.str().c_str());
+  }
+}
+
+#define KOKKOSKERNELS_MKL_SAFE_CALL(call) \
+  KokkosSparse::Impl::mkl_internal_safe_call(call, #call, __FILE__, __LINE__)
+
+inline sparse_operation_t mode_kk_to_mkl(char mode_kk) {
+  switch (toupper(mode_kk)) {
+    case 'N': return SPARSE_OPERATION_NON_TRANSPOSE;
+    case 'T': return SPARSE_OPERATION_TRANSPOSE;
+    case 'H': return SPARSE_OPERATION_CONJUGATE_TRANSPOSE;
+    default:;
+  }
+  throw std::invalid_argument(
+      "Invalid mode for MKL (should be one of N, T, H)");
+}
+
+template <typename value_type>
+struct mkl_is_supported_value_type : std::false_type {};
+
+template <>
+struct mkl_is_supported_value_type<float> : std::true_type {};
+template <>
+struct mkl_is_supported_value_type<double> : std::true_type {};
+template <>
+struct mkl_is_supported_value_type<Kokkos::complex<float>> : std::true_type {};
+template <>
+struct mkl_is_supported_value_type<Kokkos::complex<double>> : std::true_type {};
+
+// MKLSparseMatrix provides thin wrapper around MKL matrix handle
+// (sparse_matrix_t) and encapsulates MKL call dispatches related to details
+// like value_type, allowing simple client code in kernels.
+template <typename value_type>
+class MKLSparseMatrix {
+  sparse_matrix_t mtx;
+
+  static_assert(mkl_is_supported_value_type<value_type>::value,
+                "Scalar type used in MKLSparseMatrix<value_type> is NOT "
+                "supported by MKL");
+
+ public:
+  inline MKLSparseMatrix(sparse_matrix_t mtx_) : mtx(mtx_) {}
+
+  // Constructs MKL sparse matrix from KK sparse views (m rows x n cols)
+  inline MKLSparseMatrix(const MKL_INT num_rows, const MKL_INT num_cols,
+                         MKL_INT *xadj, MKL_INT *adj, value_type *values);
+
+  // Allows using MKLSparseMatrix directly in MKL calls
+  inline operator sparse_matrix_t() const { return mtx; }
+
+  // Exports MKL sparse matrix contents into KK views
+  inline void export_data(MKL_INT &num_rows, MKL_INT &num_cols,
+                          MKL_INT *&rows_start, MKL_INT *&columns,
+                          value_type *&values);
+
+  inline void destroy() {
+    KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_destroy(mtx));
+  }
+};
+
+template <>
+inline MKLSparseMatrix<float>::MKLSparseMatrix(const MKL_INT rows,
+                                               const MKL_INT cols,
+                                               MKL_INT *xadj, MKL_INT *adj,
+                                               float *values) {
+  KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_s_create_csr(
+      &mtx, SPARSE_INDEX_BASE_ZERO, rows, cols, xadj, xadj + 1, adj, values));
+}
+
+template <>
+inline MKLSparseMatrix<double>::MKLSparseMatrix(const MKL_INT rows,
+                                                const MKL_INT cols,
+                                                MKL_INT *xadj, MKL_INT *adj,
+                                                double *values) {
+  KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_csr(
+      &mtx, SPARSE_INDEX_BASE_ZERO, rows, cols, xadj, xadj + 1, adj, values));
+}
+
+template <>
+inline MKLSparseMatrix<Kokkos::complex<float>>::MKLSparseMatrix(
+    const MKL_INT rows, const MKL_INT cols, MKL_INT *xadj, MKL_INT *adj,
+    Kokkos::complex<float> *values) {
+  KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_create_csr(
+      &mtx, SPARSE_INDEX_BASE_ZERO, rows, cols, xadj, xadj + 1, adj,
+      reinterpret_cast<MKL_Complex8 *>(values)));
+}
+
+template <>
+inline MKLSparseMatrix<Kokkos::complex<double>>::MKLSparseMatrix(
+    const MKL_INT rows, const MKL_INT cols, MKL_INT *xadj, MKL_INT *adj,
+    Kokkos::complex<double> *values) {
+  KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_create_csr(
+      &mtx, SPARSE_INDEX_BASE_ZERO, rows, cols, xadj, xadj + 1, adj,
+      reinterpret_cast<MKL_Complex16 *>(values)));
+}
+
+template <>
+inline void MKLSparseMatrix<float>::export_data(MKL_INT &num_rows,
+                                                MKL_INT &num_cols,
+                                                MKL_INT *&rows_start,
+                                                MKL_INT *&columns,
+                                                float *&values) {
+  sparse_index_base_t indexing;
+  MKL_INT *rows_end;
+  KOKKOSKERNELS_MKL_SAFE_CALL(
+      mkl_sparse_s_export_csr(mtx, &indexing, &num_rows, &num_cols, &rows_start,
+                              &rows_end, &columns, &values));
+  if (SPARSE_INDEX_BASE_ZERO != indexing) {
+    throw std::runtime_error(
+        "Expected zero based indexing in exported MKL sparse matrix\n");
+    return;
+  }
+}
+
+template <>
+inline void MKLSparseMatrix<double>::export_data(MKL_INT &num_rows,
+                                                 MKL_INT &num_cols,
+                                                 MKL_INT *&rows_start,
+                                                 MKL_INT *&columns,
+                                                 double *&values) {
+  sparse_index_base_t indexing;
+  MKL_INT *rows_end;
+  KOKKOSKERNELS_MKL_SAFE_CALL(
+      mkl_sparse_d_export_csr(mtx, &indexing, &num_rows, &num_cols, &rows_start,
+                              &rows_end, &columns, &values));
+  if (SPARSE_INDEX_BASE_ZERO != indexing) {
+    throw std::runtime_error(
+        "Expected zero based indexing in exported MKL sparse matrix\n");
+    return;
+  }
+}
+
+template <>
+inline void MKLSparseMatrix<Kokkos::complex<float>>::export_data(
+    MKL_INT &num_rows, MKL_INT &num_cols, MKL_INT *&rows_start,
+    MKL_INT *&columns, Kokkos::complex<float> *&values) {
+  sparse_index_base_t indexing;
+  MKL_INT *rows_end;
+  KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_export_csr(
+      mtx, &indexing, &num_rows, &num_cols, &rows_start, &rows_end, &columns,
+      reinterpret_cast<MKL_Complex8 **>(&values)));
+  if (SPARSE_INDEX_BASE_ZERO != indexing) {
+    throw std::runtime_error(
+        "Expected zero based indexing in exported MKL sparse matrix\n");
+    return;
+  }
+}
+
+template <>
+inline void MKLSparseMatrix<Kokkos::complex<double>>::export_data(
+    MKL_INT &num_rows, MKL_INT &num_cols, MKL_INT *&rows_start,
+    MKL_INT *&columns, Kokkos::complex<double> *&values) {
+  sparse_index_base_t indexing;
+  MKL_INT *rows_end;
+  KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_export_csr(
+      mtx, &indexing, &num_rows, &num_cols, &rows_start, &rows_end, &columns,
+      reinterpret_cast<MKL_Complex16 **>(&values)));
+  if (SPARSE_INDEX_BASE_ZERO != indexing) {
+    throw std::runtime_error(
+        "Expected zero based indexing in exported MKL sparse matrix\n");
+    return;
+  }
+}
+
+}  // namespace Impl
+}  // namespace KokkosSparse
+
+#endif  // KOKKOSKERNELS_ENABLE_TPL_MKL
+
+#endif  // _KOKKOSKERNELS_SPARSEUTILS_MKL_HPP
\ No newline at end of file
diff --git a/src/common/KokkosKernels_SparseUtils_rocsparse.hpp b/src/sparse/KokkosSparse_Utils_rocsparse.hpp
similarity index 100%
rename from src/common/KokkosKernels_SparseUtils_rocsparse.hpp
rename to src/sparse/KokkosSparse_Utils_rocsparse.hpp
diff --git a/src/sparse/KokkosSparse_csc2csr.hpp b/src/sparse/KokkosSparse_csc2csr.hpp
new file mode 100644
index 0000000000..32f0c2b745
--- /dev/null
+++ b/src/sparse/KokkosSparse_csc2csr.hpp
@@ -0,0 +1,250 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include "KokkosKernels_Utils.hpp"
+#include <Kokkos_StdAlgorithms.hpp>
+
+#ifndef _KOKKOSSPARSE_CSC2CSR_HPP
+#define _KOKKOSSPARSE_CSC2CSR_HPP
+namespace KokkosSparse {
+namespace Impl {
+template <class OrdinalType, class SizeType, class ValViewType,
+          class RowIdViewType, class ColMapViewType>
+class Csc2Csr {
+ private:
+  using CrsST             = typename ValViewType::value_type;
+  using CrsOT             = OrdinalType;
+  using CrsET             = typename ValViewType::execution_space;
+  using CrsMT             = void;
+  using CrsSzT            = SizeType;
+  using CrsType           = CrsMatrix<CrsST, CrsOT, CrsET, CrsMT, CrsSzT>;
+  using CrsValsViewType   = typename CrsType::values_type;
+  using CrsRowMapViewType = typename CrsType::row_map_type::non_const_type;
+  using CrsColIdViewType  = typename CrsType::index_type;
+
+  OrdinalType __nrows;
+  OrdinalType __ncols;
+  SizeType __nnz;
+  ValViewType __vals;
+  RowIdViewType __row_ids;
+  ColMapViewType __col_map;
+
+  RowIdViewType __crs_row_cnt;
+
+  CrsValsViewType __crs_vals;
+  CrsRowMapViewType __crs_row_map;
+  CrsRowMapViewType __crs_row_map_scratch;
+  CrsColIdViewType __crs_col_ids;
+
+ public:
+  struct AlgoTags {
+    struct s1RowCnt {};
+    struct s2RowMap {};
+    struct s3Copy {};
+  };
+
+  using s1RowCntTag = typename AlgoTags::s1RowCnt;
+  using s3CopyTag   = typename AlgoTags::s3Copy;
+
+ private:
+  using TeamPolicyType = Kokkos::TeamPolicy<s3CopyTag, CrsET>;
+
+  int __suggested_team_size, __suggested_vec_size, __league_size;
+
+  template <class FunctorType>
+  void __run(FunctorType &functor) {
+    // s1RowCntTag
+    {
+      Kokkos::parallel_for("Csc2Csr",
+                           Kokkos::RangePolicy<s1RowCntTag, CrsET>(0, __nnz),
+                           functor);
+      CrsET().fence();
+    }
+    // s2RowMapTag
+    {
+      namespace KE = Kokkos::Experimental;
+      CrsET crsET;
+      // Use exclusive scan so we can allocate the row map uninitialized and
+      // avoid accessing device views on the host.
+      KE::exclusive_scan(crsET, KE::cbegin(__crs_row_cnt),
+                         KE::cend(__crs_row_cnt), KE::begin(__crs_row_map), 0);
+      CrsET().fence();
+      Kokkos::deep_copy(__crs_row_map_scratch, __crs_row_map);
+      CrsET().fence();
+    }
+    // s3CopyTag
+    {
+      TeamPolicyType teamPolicy(__ncols, __suggested_team_size,
+                                __suggested_vec_size);
+      Kokkos::parallel_for("Csc2Csr", teamPolicy, functor);
+      CrsET().fence();
+    }
+    // TODO: s3CopySortCompressTag
+  }
+
+ public:
+  template <class MemberType>
+  class __Functor {
+   private:
+    OrdinalType __nrows;
+    OrdinalType __ncols;
+    SizeType __nnz;
+    ValViewType __vals;
+    CrsValsViewType __crs_vals;
+    RowIdViewType __row_ids;
+    CrsRowMapViewType __crs_row_map;
+    CrsRowMapViewType __crs_row_map_scratch;
+    ColMapViewType __col_map;
+    CrsColIdViewType __crs_col_ids;
+    RowIdViewType __crs_row_cnt;
+
+   public:
+    __Functor(OrdinalType nrows, OrdinalType ncols, SizeType nnz,
+              ValViewType vals, CrsValsViewType crs_vals, RowIdViewType row_ids,
+              CrsRowMapViewType crs_row_map,
+              CrsRowMapViewType crs_row_map_scratch, ColMapViewType col_map,
+              CrsColIdViewType crs_col_ids, RowIdViewType crs_row_cnt)
+        : __nrows(nrows),
+          __ncols(ncols),
+          __nnz(nnz),
+          __vals(vals),
+          __crs_vals(crs_vals),
+          __row_ids(row_ids),
+          __crs_row_map(crs_row_map),
+          __crs_row_map_scratch(crs_row_map_scratch),
+          __col_map(col_map),
+          __crs_col_ids(crs_col_ids),
+          __crs_row_cnt(crs_row_cnt){};
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()(const s3CopyTag &, const MemberType &member) const {
+      auto j         = member.league_rank();
+      auto col_start = __col_map(j);
+      auto col_len   = __col_map(j + 1) - col_start;
+
+      Kokkos::parallel_for(
+          Kokkos::TeamVectorRange(member, 0, col_len), [&](const int &k) {
+            auto idx = col_start + k;
+            auto i   = __row_ids(idx);
+            auto crs_idx =
+                Kokkos::atomic_fetch_inc(&__crs_row_map_scratch.data()[i]);
+            __crs_col_ids(crs_idx) = j;
+            __crs_vals(crs_idx)    = __vals(idx);
+          });
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()(const s1RowCntTag &, const int &thread_id) const {
+      Kokkos::atomic_inc(&__crs_row_cnt.data()[__row_ids(thread_id)]);
+    }
+  };
+
+  Csc2Csr(OrdinalType nrows, OrdinalType ncols, SizeType nnz, ValViewType vals,
+          RowIdViewType row_ids, ColMapViewType col_map, int league_size = 2)
+      : __nrows(nrows),
+        __ncols(ncols),
+        __nnz(nnz),
+        __vals(vals),
+        __row_ids(row_ids),
+        __col_map(col_map),
+        __league_size(league_size) {
+    __crs_vals = CrsValsViewType(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing, "__crs_vals"), nnz);
+    __crs_row_map = CrsRowMapViewType(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing, "__crs_row_map"),
+        nrows + 1);
+    __crs_row_map_scratch =
+        CrsRowMapViewType(Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                                             "__crs_row_map_scratch"),
+                          nrows + 1);
+    __crs_col_ids = CrsColIdViewType(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing, "__crs_col_ids"), nnz);
+
+    __crs_row_cnt = RowIdViewType("__crs_row_cnt", __nrows + 1);
+
+    __Functor<typename TeamPolicyType::member_type> functor(
+        __nrows, __ncols, __nnz, __vals, __crs_vals, __row_ids, __crs_row_map,
+        __crs_row_map_scratch, __col_map, __crs_col_ids, __crs_row_cnt);
+
+    KokkosKernels::Impl::get_suggested_vector_size<int64_t, CrsET>(
+        __suggested_vec_size, __nrows, __nnz);
+    __suggested_team_size =
+        KokkosKernels::Impl::get_suggested_team_size<TeamPolicyType>(
+            functor, __suggested_vec_size);
+
+    __run(functor);
+  }
+
+  CrsType get_csrMat() {
+    return CrsType("csc2csr", __nrows, __ncols, __nnz, __crs_vals,
+                   __crs_row_map, __crs_col_ids);
+  }
+};
+}  // namespace Impl
+///
+/// \brief Converts a csc matrix to a CrsMatrix.
+/// \tparam OrdinalType The view value type associated with the RowIdViewType
+/// \tparam SizeType The type of nnz
+/// \tparam ValViewType The values view type
+/// \tparam RowIdViewType The row ids view type
+/// \tparam ColMapViewType The column map view type
+/// \param nrows The number of rows in the csc matrix
+/// \param ncols The number of columns in the csc matrix
+/// \param nnz The number of non-zeros in the csc matrix
+/// \param vals The values view of the csc matrix
+/// \param row_ids The row ids view of the csc matrix
+/// \param col_map The column map view of the csc matrix
+/// \return A KokkosSparse::CrsMatrix.
+template <class OrdinalType, class SizeType, class ValViewType,
+          class RowIdViewType, class ColMapViewType>
+auto csc2csr(OrdinalType nrows, OrdinalType ncols, SizeType nnz,
+             ValViewType vals, RowIdViewType row_ids, ColMapViewType col_map,
+             int league_size) {
+  using Csc2csrType = Impl::Csc2Csr<OrdinalType, SizeType, ValViewType,
+                                    RowIdViewType, ColMapViewType>;
+  Csc2csrType csc2Csr(nrows, ncols, nnz, vals, row_ids, col_map, league_size);
+  return csc2Csr.get_csrMat();
+}
+}  // namespace KokkosSparse
+#endif  //  _KOKKOSSPARSE_CSC2CSR_HPP
diff --git a/src/sparse/KokkosSparse_gauss_seidel.hpp b/src/sparse/KokkosSparse_gauss_seidel.hpp
index efe70dd1c5..1df960860b 100644
--- a/src/sparse/KokkosSparse_gauss_seidel.hpp
+++ b/src/sparse/KokkosSparse_gauss_seidel.hpp
@@ -132,7 +132,7 @@ void block_gauss_seidel_symbolic(
                         is_graph_symmetric);
 }
 
-template <KokkosKernels::SparseMatrixFormat format = KokkosKernels::CRS,
+template <KokkosSparse::SparseMatrixFormat format = KokkosSparse::CRS,
           typename KernelHandle, typename lno_row_view_t_,
           typename lno_nnz_view_t_, typename scalar_nnz_view_t_>
 void gauss_seidel_numeric(KernelHandle *handle,
@@ -207,7 +207,7 @@ void gauss_seidel_numeric(KernelHandle *handle,
                                                           is_graph_symmetric);
 }
 
-template <KokkosKernels::SparseMatrixFormat format = KokkosKernels::CRS,
+template <KokkosSparse::SparseMatrixFormat format = KokkosSparse::CRS,
           typename KernelHandle, typename lno_row_view_t_,
           typename lno_nnz_view_t_, typename scalar_nnz_view_t_>
 void gauss_seidel_numeric(KernelHandle *handle,
@@ -286,7 +286,7 @@ void gauss_seidel_numeric(KernelHandle *handle,
                                                           is_graph_symmetric);
 }
 
-template <KokkosKernels::SparseMatrixFormat format = KokkosKernels::BlockCRS,
+template <KokkosSparse::SparseMatrixFormat format = KokkosSparse::BlockCRS,
           typename KernelHandle, typename lno_row_view_t_,
           typename lno_nnz_view_t_, typename scalar_nnz_view_t_>
 void block_gauss_seidel_numeric(
@@ -307,7 +307,7 @@ void block_gauss_seidel_numeric(
                                values, is_graph_symmetric);
 }
 
-template <KokkosKernels::SparseMatrixFormat format = KokkosKernels::CRS,
+template <KokkosSparse::SparseMatrixFormat format = KokkosSparse::CRS,
           typename KernelHandle, typename lno_row_view_t_,
           typename lno_nnz_view_t_, typename scalar_nnz_view_t_,
           typename x_scalar_view_t, typename y_scalar_view_t>
@@ -437,7 +437,7 @@ void symmetric_gauss_seidel_apply(
                          update_y_vector, omega, numIter, true, true);
 }
 
-template <KokkosKernels::SparseMatrixFormat format = KokkosKernels::BlockCRS,
+template <KokkosSparse::SparseMatrixFormat format = KokkosSparse::BlockCRS,
           typename KernelHandle, typename lno_row_view_t_,
           typename lno_nnz_view_t_, typename scalar_nnz_view_t_,
           typename x_scalar_view_t, typename y_scalar_view_t>
@@ -471,7 +471,7 @@ void symmetric_block_gauss_seidel_apply(
       handle, num_rows, num_cols, row_map, entries, values, x_lhs_output_vec,
       y_rhs_input_vec, init_zero_x_vector, update_y_vector, omega, numIter);
 }
-template <KokkosKernels::SparseMatrixFormat format = KokkosKernels::CRS,
+template <KokkosSparse::SparseMatrixFormat format = KokkosSparse::CRS,
           class KernelHandle, typename lno_row_view_t_,
           typename lno_nnz_view_t_, typename scalar_nnz_view_t_,
           typename x_scalar_view_t, typename y_scalar_view_t>
@@ -603,7 +603,7 @@ void forward_sweep_gauss_seidel_apply(
                          update_y_vector, omega, numIter, true, false);
 }
 
-template <KokkosKernels::SparseMatrixFormat format = KokkosKernels::BlockCRS,
+template <KokkosSparse::SparseMatrixFormat format = KokkosSparse::BlockCRS,
           typename KernelHandle, typename lno_row_view_t_,
           typename lno_nnz_view_t_, typename scalar_nnz_view_t_,
           typename x_scalar_view_t, typename y_scalar_view_t>
@@ -637,7 +637,7 @@ void forward_sweep_block_gauss_seidel_apply(
       handle, num_rows, num_cols, row_map, entries, values, x_lhs_output_vec,
       y_rhs_input_vec, init_zero_x_vector, update_y_vector, omega, numIter);
 }
-template <KokkosKernels::SparseMatrixFormat format = KokkosKernels::CRS,
+template <KokkosSparse::SparseMatrixFormat format = KokkosSparse::CRS,
           class KernelHandle, typename lno_row_view_t_,
           typename lno_nnz_view_t_, typename scalar_nnz_view_t_,
           typename x_scalar_view_t, typename y_scalar_view_t>
@@ -769,7 +769,7 @@ void backward_sweep_gauss_seidel_apply(
                          update_y_vector, omega, numIter, false, true);
 }
 
-template <KokkosKernels::SparseMatrixFormat format = KokkosKernels::BlockCRS,
+template <KokkosSparse::SparseMatrixFormat format = KokkosSparse::BlockCRS,
           typename KernelHandle, typename lno_row_view_t_,
           typename lno_nnz_view_t_, typename scalar_nnz_view_t_,
           typename x_scalar_view_t, typename y_scalar_view_t>
diff --git a/src/sparse/KokkosSparse_spadd.hpp b/src/sparse/KokkosSparse_spadd.hpp
index 6db63455be..38bead14de 100644
--- a/src/sparse/KokkosSparse_spadd.hpp
+++ b/src/sparse/KokkosSparse_spadd.hpp
@@ -46,460 +46,13 @@
 #define _KOKKOS_SPADD_HPP
 
 #include "KokkosKernels_Handle.hpp"
-#include "KokkosKernels_Sorting.hpp"
-#include "Kokkos_ArithTraits.hpp"
+#include "KokkosKernels_helpers.hpp"
+#include "KokkosSparse_spadd_symbolic_spec.hpp"
+#include "KokkosSparse_spadd_numeric_spec.hpp"
 
 namespace KokkosSparse {
 namespace Experimental {
 
-/*
-Unsorted symbolic algorithm notes:
--Only needs to sort and merge indices once, in symbolic (sorting is expensive)
--Can't afford to allocate dense Views for indices/values (assume number of
-columns is very large) -Want numeric() to know exactly where each A/B entry
-belongs in Ccolinds/Cvalues -To accomplish all of these, symbolic() computes
-arrays Apos and Bpos (both are type clno_nnz_view_t_, and have same length as
-a_entries and b_entries respectively) -Apos/Bpos are saved in the handle -Apos
-and Bpos each contain the final index within C row where the A/B entry belongs
--See UnsortedNumericSumFunctor below for the usage of Apos/Bpos
-*/
-
-// Helper macro to check that two types are the same (ignoring const)
-#define SAME_TYPE(A, B)                             \
-  std::is_same<typename std::remove_const<A>::type, \
-               typename std::remove_const<B>::type>::value
-
-// get C rowmap for sorted input
-template <typename size_type, typename ordinal_type, typename ARowPtrsT,
-          typename BRowPtrsT, typename AColIndsT, typename BColIndsT,
-          typename CRowPtrsT, typename ExecSpace>
-struct SortedCountEntriesRange {
-  SortedCountEntriesRange(ordinal_type nrows_,
-                          const typename ARowPtrsT::const_type& Arowptrs_,
-                          const AColIndsT& Acolinds_,
-                          const typename BRowPtrsT::const_type& Browptrs_,
-                          const BColIndsT& Bcolinds_,
-                          const CRowPtrsT& Crowcounts_)
-      : nrows(nrows_),
-        Arowptrs(Arowptrs_),
-        Acolinds(Acolinds_),
-        Browptrs(Browptrs_),
-        Bcolinds(Bcolinds_),
-        Crowcounts(Crowcounts_) {}
-
-  KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const {
-    const ordinal_type ORDINAL_MAX = Kokkos::ArithTraits<ordinal_type>::max();
-
-    // count the union of nonzeros in Arow and Brow
-    size_type numEntries = 0;
-    size_type ai         = 0;
-    size_type bi         = 0;
-    size_type Arowstart  = Arowptrs(i);
-    size_type Arowlen    = Arowptrs(i + 1) - Arowstart;
-    size_type Browstart  = Browptrs(i);
-    size_type Browlen    = Browptrs(i + 1) - Browstart;
-    ordinal_type Acol    = (Arowlen == 0) ? ORDINAL_MAX : Acolinds(Arowstart);
-    ordinal_type Bcol    = (Browlen == 0) ? ORDINAL_MAX : Bcolinds(Browstart);
-    while (Acol != ORDINAL_MAX || Bcol != ORDINAL_MAX) {
-      ordinal_type Ccol = (Acol < Bcol) ? Acol : Bcol;
-      numEntries++;
-      // Eat all entries in both A and B which have this column
-      // This also results in Acol/Bcol being updated to following entries for
-      // next loop iter
-      while (Acol == Ccol)
-        Acol = (ai == Arowlen) ? ORDINAL_MAX : Acolinds(Arowstart + ai++);
-      while (Bcol == Ccol)
-        Bcol = (bi == Browlen) ? ORDINAL_MAX : Bcolinds(Browstart + bi++);
-    }
-    Crowcounts(i) = numEntries;
-  }
-
-  ordinal_type nrows;
-  const typename ARowPtrsT::const_type Arowptrs;
-  const AColIndsT Acolinds;
-  const typename BRowPtrsT::const_type Browptrs;
-  const BColIndsT Bcolinds;
-  CRowPtrsT Crowcounts;
-};
-
-template <typename size_type, typename ordinal_type, typename ARowPtrsT,
-          typename BRowPtrsT, typename AColIndsT, typename BColIndsT,
-          typename CRowPtrsT, typename ExecSpace>
-struct SortedCountEntriesTeam {
-  SortedCountEntriesTeam(ordinal_type nrows_,
-                         const typename ARowPtrsT::const_type& Arowptrs_,
-                         const AColIndsT& Acolinds_,
-                         const typename BRowPtrsT::const_type& Browptrs_,
-                         const BColIndsT& Bcolinds_,
-                         const CRowPtrsT& Crowcounts_)
-      : nrows(nrows_),
-        Arowptrs(Arowptrs_),
-        Acolinds(Acolinds_),
-        Browptrs(Browptrs_),
-        Bcolinds(Bcolinds_),
-        Crowcounts(Crowcounts_) {}
-
-  using TeamPol = Kokkos::TeamPolicy<ExecSpace>;
-  using TeamMem = typename TeamPol::member_type;
-
-  KOKKOS_INLINE_FUNCTION void longRowFallback(const ordinal_type i) const {
-    const ordinal_type ORDINAL_MAX = Kokkos::ArithTraits<ordinal_type>::max();
-
-    // count the union of nonzeros in Arow and Brow
-    size_type numEntries = 0;
-    size_type ai         = 0;
-    size_type bi         = 0;
-    size_type Arowstart  = Arowptrs(i);
-    size_type Arowlen    = Arowptrs(i + 1) - Arowstart;
-    size_type Browstart  = Browptrs(i);
-    size_type Browlen    = Browptrs(i + 1) - Browstart;
-    ordinal_type Acol    = (Arowlen == 0) ? ORDINAL_MAX : Acolinds(Arowstart);
-    ordinal_type Bcol    = (Browlen == 0) ? ORDINAL_MAX : Bcolinds(Browstart);
-    while (Acol != ORDINAL_MAX || Bcol != ORDINAL_MAX) {
-      ordinal_type Ccol = (Acol < Bcol) ? Acol : Bcol;
-      numEntries++;
-      // Eat all entries in both A and B which have this column
-      // This also results in Acol/Bcol being updated to following entries for
-      // next loop iter
-      while (Acol == Ccol)
-        Acol = (ai == Arowlen) ? ORDINAL_MAX : Acolinds(Arowstart + ai++);
-      while (Bcol == Ccol)
-        Bcol = (bi == Browlen) ? ORDINAL_MAX : Bcolinds(Browstart + bi++);
-    }
-    Crowcounts(i) = numEntries;
-  }
-
-  KOKKOS_INLINE_FUNCTION void operator()(const TeamMem t) const {
-    ordinal_type i = t.league_rank() * t.team_size() + t.team_rank();
-    if (i >= nrows) return;
-    ordinal_type* allScratch =
-        (ordinal_type*)t.team_shmem().get_shmem(totalShared);
-    ordinal_type* scratch  = allScratch + t.team_rank() * sharedPerThread;
-    ordinal_type Arowstart = Arowptrs(i);
-    ordinal_type Arowlen   = Arowptrs(i + 1) - Arowstart;
-    ordinal_type Browstart = Browptrs(i);
-    ordinal_type Browlen   = Browptrs(i + 1) - Browstart;
-    ordinal_type n         = Arowlen + Browlen;
-    if (n > sharedPerThread) {
-      // fall back to slow serial method
-      Kokkos::single(Kokkos::PerThread(t), [&]() { longRowFallback(i); });
-      return;
-    }
-    if (n == 0) {
-      Kokkos::single(Kokkos::PerThread(t), [&]() { Crowcounts(i) = 0; });
-      return;
-    }
-    // Figure out the number of bitonic steps: ceil(log2(n))
-    ordinal_type npot   = 1;
-    ordinal_type levels = 0;
-    while (npot < n) {
-      levels++;
-      npot <<= 1;
-    }
-    // Copy A and B entries to scratch
-    Kokkos::parallel_for(
-        Kokkos::ThreadVectorRange(t, Arowlen),
-        [&](ordinal_type j) { scratch[j] = Acolinds(Arowstart + j); });
-    Kokkos::parallel_for(Kokkos::ThreadVectorRange(t, Browlen),
-                         [&](ordinal_type j) {
-                           scratch[npot - 1 - j] = Bcolinds(Browstart + j);
-                         });
-    // Fill space between A and B with ORDINAL_MAX,
-    // to maintain a valid bitonic sequence of power-of-two length
-    Kokkos::parallel_for(
-        Kokkos::ThreadVectorRange(t, npot - n), [&](ordinal_type j) {
-          scratch[Arowlen + j] = Kokkos::ArithTraits<ordinal_type>::max();
-        });
-    // npot = 2^levels
-    for (ordinal_type level = 0; level < levels; level++) {
-      // npot/2 pairs of items are compared in parallel
-      Kokkos::parallel_for(Kokkos::ThreadVectorRange(t, npot >> 1),
-                           [&](const ordinal_type j) {
-                             ordinal_type boxSize = npot >> level;
-                             // Which box contains this thread?
-                             // box = (j / boxSize), and boxSize =
-                             // 2^(levels-level), so box = j * 2^(level-levels)
-                             // = j >> (levels - level)
-                             ordinal_type boxID = (j * 2) >> (levels - level);
-                             // boxStart = boxID * boxSize = boxID *
-                             // 2^(levels-level) = boxID << (levels-level)
-                             ordinal_type boxStart  = boxID << (levels - level);
-                             ordinal_type boxOffset = j - boxID * boxSize / 2;
-                             ordinal_type elem1     = boxStart + boxOffset;
-                             ordinal_type elem2     = elem1 + (boxSize >> 1);
-                             if (scratch[elem2] < scratch[elem1]) {
-                               ordinal_type temp = scratch[elem1];
-                               scratch[elem1]    = scratch[elem2];
-                               scratch[elem2]    = temp;
-                             }
-                           });
-    }
-    // Finally, count the number of distinct entries (this is #rising edges + 1)
-    ordinal_type risingEdges;
-    Kokkos::parallel_reduce(
-        Kokkos::ThreadVectorRange(t, n - 1),
-        [&](const ordinal_type j, ordinal_type& lcount) {
-          if (scratch[j] != scratch[j + 1]) lcount++;
-        },
-        risingEdges);
-    Kokkos::single(Kokkos::PerThread(t),
-                   [&]() { Crowcounts(i) = risingEdges + 1; });
-  }
-
-  size_t team_shmem_size(int teamSize) const {
-    return sharedPerThread * sizeof(ordinal_type) * teamSize;
-  }
-
-  ordinal_type nrows;
-  const typename ARowPtrsT::const_type Arowptrs;
-  const AColIndsT Acolinds;
-  const typename BRowPtrsT::const_type Browptrs;
-  const BColIndsT Bcolinds;
-  CRowPtrsT Crowcounts;
-  int sharedPerThread;  // Shared for each thread, measured in
-                        // sizeof(ordinal_type)
-  int totalShared;      // Shared for whole team, measured in bytes
-};
-
-// get upper bound for C entries per row (assumes worst case, that entries in A
-// and B on each row are disjoint)
-template <typename size_type, typename ordinal_type, typename ARowPtrsT,
-          typename BRowPtrsT, typename CRowPtrsT>
-struct UnsortedEntriesUpperBound {
-  UnsortedEntriesUpperBound(ordinal_type nrows_,
-                            const typename ARowPtrsT::const_type& Arowptrs_,
-                            const typename BRowPtrsT::const_type& Browptrs_,
-                            const CRowPtrsT& Crowcounts_)
-      : nrows(nrows_),
-        Arowptrs(Arowptrs_),
-        Browptrs(Browptrs_),
-        Crowcounts(Crowcounts_) {}
-  KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const {
-    Crowcounts(i) =
-        (Arowptrs(i + 1) - Arowptrs(i)) + (Browptrs(i + 1) - Browptrs(i));
-    if (i == nrows - 1) {
-      // last workitem also zeros the one-past-end entry of row counts, so
-      // that prefix sum is correct
-      Crowcounts(nrows) = 0;
-    }
-  }
-  ordinal_type nrows;
-  const typename ARowPtrsT::const_type Arowptrs;
-  const typename BRowPtrsT::const_type Browptrs;
-  CRowPtrsT Crowcounts;
-};
-
-// Unsorted symbolic: new functors:
-//  -compute uncompressed C (entries only, no values)
-//  -sort uncompressed C entries within row, while permuting A union B
-//  permutation array -compress sorted C entries and A,B perm arrays at the same
-//  time, which produces Crowcounts value
-// Inputs: A, B rowptrs/colinds, C uncompressed rowptrs (and allocated C
-// entries) Output: C uncompressed colinds
-template <typename size_type, typename ordinal_type, typename ArowptrsT,
-          typename BrowptrsT, typename CrowptrsT, typename AcolindsT,
-          typename BcolindsT, typename CcolindsT>
-struct UnmergedSumFunctor {
-  UnmergedSumFunctor(ordinal_type nrows_, const ArowptrsT& Arowptrs_,
-                     const AcolindsT& Acolinds_, const BrowptrsT& Browptrs_,
-                     const BcolindsT& Bcolinds_, const CrowptrsT& Crowptrs_,
-                     const CcolindsT& Ccolinds_, const CcolindsT& ABperm_)
-      : nrows(nrows_),
-        Arowptrs(Arowptrs_),
-        Acolinds(Acolinds_),
-        Browptrs(Browptrs_),
-        Bcolinds(Bcolinds_),
-        Crowptrs(Crowptrs_),
-        Ccolinds(Ccolinds_),
-        ABperm(ABperm_) {}
-  KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const {
-    size_type inserted  = 0;
-    size_type crowstart = Crowptrs(i);
-    size_type arowstart = Arowptrs(i);
-    size_type arowlen   = Arowptrs(i + 1) - arowstart;
-    size_type browstart = Browptrs(i);
-    size_type browlen   = Browptrs(i + 1) - browstart;
-    // Insert all A entries, then all B entries
-    for (size_type j = 0; j < arowlen; j++) {
-      Ccolinds(crowstart + inserted) = Acolinds(arowstart + j);
-      ABperm(crowstart + inserted)   = j;
-      inserted++;
-    }
-    for (size_type j = 0; j < browlen; j++) {
-      Ccolinds(crowstart + inserted) = Bcolinds(browstart + j);
-      // tell A and B permutation values apart by adding arowlen as a bias to B
-      // values
-      ABperm(crowstart + inserted) = j + arowlen;
-      inserted++;
-    }
-  }
-  ordinal_type nrows;
-  const ArowptrsT Arowptrs;
-  const AcolindsT Acolinds;
-  const BrowptrsT Browptrs;
-  const BcolindsT Bcolinds;
-  const CrowptrsT Crowptrs;
-  CcolindsT Ccolinds;
-  CcolindsT ABperm;
-};
-
-template <typename size_type, typename ordinal_type, typename ArowptrsT,
-          typename BrowptrsT, typename CrowptrsT, typename CcolindsT>
-struct MergeEntriesFunctor {
-  MergeEntriesFunctor(ordinal_type nrows_, const ArowptrsT& Arowptrs_,
-                      const BrowptrsT& Browptrs_, const CrowptrsT& Crowptrs_,
-                      const CrowptrsT& Crowcounts_, const CcolindsT& Ccolinds_,
-                      const CcolindsT& ABperm_, const CcolindsT& Apos_,
-                      const CcolindsT& Bpos_)
-      : nrows(nrows_),
-        Arowptrs(Arowptrs_),
-        Browptrs(Browptrs_),
-        Crowptrs(Crowptrs_),
-        Crowcounts(Crowcounts_),
-        Ccolinds(Ccolinds_),
-        ABperm(ABperm_),
-        Apos(Apos_),
-        Bpos(Bpos_) {}
-  KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const {
-    size_type CrowStart = Crowptrs(i);
-    size_type CrowEnd   = Crowptrs(i + 1);
-    if (CrowEnd == CrowStart) {
-      Crowcounts(i) = 0;
-      return;
-    }
-    size_type ArowStart = Arowptrs(i);
-    size_type ArowNum   = Arowptrs(i + 1) - ArowStart;
-    size_type BrowStart = Browptrs(i);
-    ordinal_type CFit   = 0;  // counting through merged C indices (within row)
-    for (size_type Cit = CrowStart; Cit < CrowEnd; Cit++) {
-      if ((Cit > CrowStart) && (Ccolinds(Cit) != Ccolinds(Cit - 1))) {
-        // This is a different column than the previous entry, and is not the
-        // first entry. This means that this is the first occurence of a unique
-        // column.
-        CFit++;
-      }
-      size_type permVal = ABperm(Cit);
-      if (permVal < ArowNum) {
-        // Entry belongs to A
-        ordinal_type Aindex = permVal;
-        // The Aindex'th entry in row i of A will be added into the CFit'th
-        // entry in C
-        Apos(ArowStart + Aindex) = CFit;
-      } else {
-        // Entry belongs to B
-        ordinal_type Bindex = permVal - ArowNum;
-        // The Bindex'th entry in row i of B will be added into the CFit'th
-        // entry in C
-        Bpos(BrowStart + Bindex) = CFit;
-      }
-    }
-    // At end of the row, know how many entries are in merged C.
-    // Right now, CFit is the index of the last Apos/Bpos,
-    // so adding one gives the total number of entries.
-    Crowcounts(i) = CFit + 1;
-  }
-  ordinal_type nrows;
-  const ArowptrsT Arowptrs;
-  const BrowptrsT Browptrs;
-  const CrowptrsT Crowptrs;
-  CrowptrsT Crowcounts;
-  CcolindsT Ccolinds;
-  const CcolindsT ABperm;
-  CcolindsT Apos;
-  CcolindsT Bpos;
-};
-
-// Run SortedCountEntries: non-GPU, always uses the RangePolicy version.
-template <typename KernelHandle, typename alno_row_view_t_,
-          typename alno_nnz_view_t_, typename blno_row_view_t_,
-          typename blno_nnz_view_t_, typename clno_row_view_t_>
-void runSortedCountEntries(
-    const alno_row_view_t_& a_rowmap, const alno_nnz_view_t_& a_entries,
-    const blno_row_view_t_& b_rowmap, const blno_nnz_view_t_& b_entries,
-    const clno_row_view_t_& c_rowmap,
-    typename std::enable_if<!KokkosKernels::Impl::kk_is_gpu_exec_space<
-        typename KernelHandle::SPADDHandleType::execution_space>()>::type* =
-        nullptr) {
-  using size_type    = typename KernelHandle::size_type;
-  using ordinal_type = typename KernelHandle::nnz_lno_t;
-  using execution_space =
-      typename KernelHandle::SPADDHandleType::execution_space;
-  using range_type = Kokkos::RangePolicy<execution_space>;
-  auto nrows       = c_rowmap.extent(0) - 1;
-  SortedCountEntriesRange<size_type, ordinal_type, alno_row_view_t_,
-                          blno_row_view_t_, alno_nnz_view_t_, blno_nnz_view_t_,
-                          clno_row_view_t_, execution_space>
-      countEntries(nrows, a_rowmap, a_entries, b_rowmap, b_entries, c_rowmap);
-  Kokkos::parallel_for(
-      "KokkosSparse::SpAdd::Symbolic::InputSorted::CountEntries",
-      range_type(0, nrows), countEntries);
-}
-
-// Run SortedCountEntries: GPU, uses the TeamPolicy or RangePolicy depending
-//  on average nz per row (a runtime decision)
-template <typename KernelHandle, typename alno_row_view_t_,
-          typename alno_nnz_view_t_, typename blno_row_view_t_,
-          typename blno_nnz_view_t_, typename clno_row_view_t_>
-void runSortedCountEntries(
-    const alno_row_view_t_& a_rowmap, const alno_nnz_view_t_& a_entries,
-    const blno_row_view_t_& b_rowmap, const blno_nnz_view_t_& b_entries,
-    const clno_row_view_t_& c_rowmap,
-    typename std::enable_if<KokkosKernels::Impl::kk_is_gpu_exec_space<
-        typename KernelHandle::SPADDHandleType::execution_space>()>::type* =
-        nullptr) {
-  using size_type    = typename KernelHandle::size_type;
-  using ordinal_type = typename KernelHandle::nnz_lno_t;
-  using execution_space =
-      typename KernelHandle::SPADDHandleType::execution_space;
-  using RangePol = Kokkos::RangePolicy<execution_space>;
-  using TeamPol  = Kokkos::TeamPolicy<execution_space>;
-  auto nrows     = c_rowmap.extent(0) - 1;
-  size_type c_est_nnz =
-      1.4 * (a_entries.extent(0) + b_entries.extent(0)) / nrows;
-  if (c_est_nnz <= 512) {
-    // Convert c_est_nnz to a power of 2
-    size_type pot_est_nnz = 1;
-    while (pot_est_nnz < c_est_nnz) pot_est_nnz *= 2;
-    // Estimate max number of uncompressed entries in each row of C
-    int vector_length = 1;
-    int vector_length_max =
-        KokkosKernels::Impl::kk_get_max_vector_size<execution_space>();
-    while (vector_length * 2 <= vector_length_max &&
-           (size_type)vector_length * 2 <= pot_est_nnz) {
-      vector_length *= 2;
-    }
-    SortedCountEntriesTeam<size_type, ordinal_type, alno_row_view_t_,
-                           blno_row_view_t_, alno_nnz_view_t_, blno_nnz_view_t_,
-                           clno_row_view_t_, execution_space>
-        countEntries(nrows, a_rowmap, a_entries, b_rowmap, b_entries, c_rowmap);
-    countEntries.sharedPerThread = pot_est_nnz;
-    // compute largest possible team size
-    TeamPol testPolicy(1, 1, vector_length);
-    testPolicy.set_scratch_size(
-        0, Kokkos::PerThread(pot_est_nnz * sizeof(ordinal_type)));
-    int team_size = testPolicy.team_size_recommended(countEntries,
-                                                     Kokkos::ParallelForTag());
-    // construct real policy
-    int league_size = (nrows + team_size - 1) / team_size;
-    TeamPol policy(league_size, team_size, vector_length);
-    policy.set_scratch_size(
-        0, Kokkos::PerThread(pot_est_nnz * sizeof(ordinal_type)));
-    countEntries.totalShared =
-        countEntries.sharedPerThread * team_size * sizeof(ordinal_type);
-    Kokkos::parallel_for(
-        "KokkosSparse::SpAdd::Symbolic::InputSorted::CountEntries", policy,
-        countEntries);
-  } else {
-    SortedCountEntriesRange<size_type, ordinal_type, alno_row_view_t_,
-                            blno_row_view_t_, alno_nnz_view_t_,
-                            blno_nnz_view_t_, clno_row_view_t_, execution_space>
-        countEntries(nrows, a_rowmap, a_entries, b_rowmap, b_entries, c_rowmap);
-    Kokkos::parallel_for(
-        "KokkosSparse::SpAdd::Symbolic::InputSorted::CountEntries",
-        RangePol(0, nrows), countEntries);
-  }
-}
-
 // Symbolic: count entries in each row in C to produce rowmap
 // kernel handle has information about whether it is sorted add or not.
 template <typename KernelHandle, typename alno_row_view_t_,
@@ -513,288 +66,63 @@ void spadd_symbolic(
     clno_row_view_t_ c_rowmap)  // c_rowmap must already be allocated (doesn't
                                 // need to be initialized)
 {
-  typedef
-      typename KernelHandle::SPADDHandleType::execution_space execution_space;
-  typedef typename KernelHandle::size_type size_type;
-  typedef typename KernelHandle::nnz_lno_t ordinal_type;
-  // Check that A/B/C data types match KernelHandle types, and that C data types
-  // are nonconst (doesn't matter if A/B types are const)
-  static_assert(
-      SAME_TYPE(typename alno_row_view_t_::non_const_value_type, size_type),
-      "add_symbolic: A size_type must match KernelHandle size_type (const "
-      "doesn't matter)");
-  static_assert(
-      SAME_TYPE(typename blno_row_view_t_::non_const_value_type, size_type),
-      "add_symbolic: B size_type must match KernelHandle size_type (const "
-      "doesn't matter)");
-  static_assert(
-      SAME_TYPE(typename clno_row_view_t_::non_const_value_type, size_type),
-      "add_symbolic: C size_type must match KernelHandle size_type)");
-  static_assert(std::is_same<typename clno_row_view_t_::non_const_value_type,
-                             typename clno_row_view_t_::value_type>::value,
-                "add_symbolic: C size_type must not be const");
-  static_assert(
-      SAME_TYPE(typename alno_nnz_view_t_::non_const_value_type, ordinal_type),
-      "add_symbolic: A entry type must match KernelHandle entry type (aka "
-      "nnz_lno_t, and const doesn't matter)");
-  static_assert(
-      SAME_TYPE(typename blno_nnz_view_t_::non_const_value_type, ordinal_type),
-      "add_symbolic: B entry type must match KernelHandle entry type (aka "
-      "nnz_lno_t, and const doesn't matter)");
-  static_assert(
-      SAME_TYPE(typename clno_nnz_view_t_::non_const_value_type, ordinal_type),
-      "add_symbolic: C entry type must match KernelHandle entry type (aka "
-      "nnz_lno_t)");
-  static_assert(std::is_same<typename clno_row_view_t_::non_const_value_type,
-                             typename clno_row_view_t_::value_type>::value,
-                "add_symbolic: C entry type must not be const");
-  // symbolic just needs to compute c_rowmap
-  // easy for sorted, but for unsorted is easiest to just compute the whole sum
-  auto addHandle = handle->get_spadd_handle();
-  if (a_rowmap.extent(0) == 0 || a_rowmap.extent(0) == 1) {
-    // Have 0 rows, so nothing to do except set #nnz to 0
-    addHandle->set_c_nnz(0);
-    // If c_rowmap has a single entry, it must be 0
-    if (c_rowmap.extent(0)) Kokkos::deep_copy(c_rowmap, (size_type)0);
-    addHandle->set_call_symbolic();
-    return;
-  }
-  ordinal_type nrows = a_rowmap.extent(0) - 1;
-  typedef Kokkos::RangePolicy<execution_space, ordinal_type> range_type;
-  if (addHandle->is_input_sorted()) {
-    runSortedCountEntries<KernelHandle, alno_row_view_t_, alno_nnz_view_t_,
-                          blno_row_view_t_, blno_nnz_view_t_, clno_row_view_t_>(
-        a_rowmap, a_entries, b_rowmap, b_entries, c_rowmap);
-    KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum<clno_row_view_t_,
-                                                          execution_space>(
-        nrows + 1, c_rowmap);
-  } else {
-    // note: scoping individual parts of the process to free views sooner,
-    // minimizing peak memory usage run the unsorted c_rowmap upper bound
-    // functor (just adds together A and B entry counts row by row)
-    clno_row_view_t_ c_rowmap_upperbound(
-        Kokkos::view_alloc(Kokkos::WithoutInitializing,
-                           "C row counts upper bound"),
-        nrows + 1);
-    size_type c_nnz_upperbound = 0;
-    {
-      UnsortedEntriesUpperBound<size_type, ordinal_type, alno_row_view_t_,
-                                blno_row_view_t_, clno_row_view_t_>
-          countEntries(nrows, a_rowmap, b_rowmap, c_rowmap_upperbound);
-      Kokkos::parallel_for(
-          "KokkosSparse::SpAdd:Symbolic::InputNotSorted::CountEntries",
-          range_type(0, nrows), countEntries);
-      KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum<clno_row_view_t_,
-                                                            execution_space>(
-          nrows + 1, c_rowmap_upperbound);
-      Kokkos::deep_copy(c_nnz_upperbound,
-                        Kokkos::subview(c_rowmap_upperbound, nrows));
-    }
-    clno_nnz_view_t_ c_entries_uncompressed(
-        Kokkos::view_alloc(Kokkos::WithoutInitializing,
-                           "C entries uncompressed"),
-        c_nnz_upperbound);
-    clno_nnz_view_t_ ab_perm(
-        Kokkos::view_alloc(Kokkos::WithoutInitializing,
-                           "A and B permuted entry indices"),
-        c_nnz_upperbound);
-    // compute the unmerged sum
-    UnmergedSumFunctor<size_type, ordinal_type, alno_row_view_t_,
-                       blno_row_view_t_, clno_row_view_t_, alno_nnz_view_t_,
-                       blno_nnz_view_t_, clno_nnz_view_t_>
-        unmergedSum(nrows, a_rowmap, a_entries, b_rowmap, b_entries,
-                    c_rowmap_upperbound, c_entries_uncompressed, ab_perm);
-    Kokkos::parallel_for(
-        "KokkosSparse::SpAdd:Symbolic::InputNotSorted::UnmergedSum",
-        range_type(0, nrows), unmergedSum);
-    // sort the unmerged sum
-    KokkosKernels::sort_crs_matrix<execution_space, clno_row_view_t_,
-                                   clno_nnz_view_t_, clno_nnz_view_t_>(
-        c_rowmap_upperbound, c_entries_uncompressed, ab_perm);
-    clno_nnz_view_t_ a_pos(
-        Kokkos::view_alloc(Kokkos::WithoutInitializing, "A entry positions"),
-        a_entries.extent(0));
-    clno_nnz_view_t_ b_pos(
-        Kokkos::view_alloc(Kokkos::WithoutInitializing, "B entry positions"),
-        b_entries.extent(0));
-    // merge the entries and compute Apos/Bpos, as well as Crowcounts
-    {
-      MergeEntriesFunctor<size_type, ordinal_type, alno_row_view_t_,
-                          blno_row_view_t_, clno_row_view_t_, clno_nnz_view_t_>
-          mergeEntries(nrows, a_rowmap, b_rowmap, c_rowmap_upperbound, c_rowmap,
-                       c_entries_uncompressed, ab_perm, a_pos, b_pos);
-      Kokkos::parallel_for(
-          "KokkosSparse::SpAdd:Symbolic::InputNotSorted::MergeEntries",
-          range_type(0, nrows), mergeEntries);
-      // compute actual c_rowmap
-      KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum<clno_row_view_t_,
-                                                            execution_space>(
-          nrows + 1, c_rowmap);
-    }
-    addHandle->set_a_b_pos(a_pos, b_pos);
-  }
-  // provide the number of NNZ in C to user through handle
-  size_type cmax;
-  Kokkos::deep_copy(cmax, Kokkos::subview(c_rowmap, nrows));
-  addHandle->set_c_nnz(cmax);
-  addHandle->set_call_symbolic();
-  addHandle->set_call_numeric(false);
-  // this fence is for accurate timing from host
-  execution_space().fence();
+  typedef typename KernelHandle::HandleExecSpace ExecSpace;
+  typedef typename KernelHandle::HandleTempMemorySpace MemSpace;
+  typedef typename KernelHandle::HandlePersistentMemorySpace PersistentMemSpace;
+  typedef typename Kokkos::Device<ExecSpace, MemSpace> DeviceType;
+
+  typedef typename KernelHandle::const_size_type c_size_t;
+  typedef typename KernelHandle::const_nnz_lno_t c_lno_t;
+  typedef typename KernelHandle::const_nnz_scalar_t c_scalar_t;
+
+  typedef typename KokkosKernels::Experimental::KokkosKernelsHandle<
+      c_size_t, c_lno_t, c_scalar_t, ExecSpace, MemSpace, PersistentMemSpace>
+      ConstKernelHandle;
+  ConstKernelHandle tmp_handle(*handle);
+
+  typedef Kokkos::View<typename alno_row_view_t_::const_value_type*,
+                       typename KokkosKernels::Impl::GetUnifiedLayout<
+                           alno_row_view_t_>::array_layout,
+                       DeviceType, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
+      Internal_a_rowmap;
+  typedef Kokkos::View<typename alno_nnz_view_t_::const_value_type*,
+                       typename KokkosKernels::Impl::GetUnifiedLayout<
+                           alno_nnz_view_t_>::array_layout,
+                       DeviceType, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
+      Internal_a_entries;
+  typedef Kokkos::View<typename blno_row_view_t_::const_value_type*,
+                       typename KokkosKernels::Impl::GetUnifiedLayout<
+                           blno_row_view_t_>::array_layout,
+                       DeviceType, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
+      Internal_b_rowmap;
+  typedef Kokkos::View<typename blno_nnz_view_t_::const_value_type*,
+                       typename KokkosKernels::Impl::GetUnifiedLayout<
+                           blno_nnz_view_t_>::array_layout,
+                       DeviceType, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
+      Internal_b_entries;
+  typedef Kokkos::View<typename clno_row_view_t_::value_type*,
+                       typename KokkosKernels::Impl::GetUnifiedLayout<
+                           clno_row_view_t_>::array_layout,
+                       DeviceType, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
+      Internal_c_rowmap;
+  KokkosSparse::Impl::SPADD_SYMBOLIC<ConstKernelHandle, Internal_a_rowmap,
+                                     Internal_a_entries, Internal_b_rowmap,
+                                     Internal_b_entries, Internal_c_rowmap>::
+      spadd_symbolic(&tmp_handle,
+                     Internal_a_rowmap(a_rowmap.data(), a_rowmap.extent(0)),
+                     Internal_a_entries(a_entries.data(), a_entries.extent(0)),
+                     Internal_b_rowmap(b_rowmap.data(), b_rowmap.extent(0)),
+                     Internal_b_entries(b_entries.data(), b_entries.extent(0)),
+                     Internal_c_rowmap(c_rowmap.data(), c_rowmap.extent(0)));
 }
 
-template <typename size_type, typename ordinal_type, typename ArowptrsT,
-          typename BrowptrsT, typename CrowptrsT, typename AcolindsT,
-          typename BcolindsT, typename CcolindsT, typename AvaluesT,
-          typename BvaluesT, typename CvaluesT, typename AscalarT,
-          typename BscalarT>
-struct SortedNumericSumFunctor {
-  using CscalarT = typename CvaluesT::non_const_value_type;
-
-  SortedNumericSumFunctor(const ArowptrsT& Arowptrs_,
-                          const BrowptrsT& Browptrs_,
-                          const CrowptrsT& Crowptrs_,
-                          const AcolindsT& Acolinds_,
-                          const BcolindsT& Bcolinds_,
-                          const CcolindsT& Ccolinds_, const AvaluesT& Avalues_,
-                          const BvaluesT& Bvalues_, const CvaluesT& Cvalues_,
-                          const AscalarT alpha_, const BscalarT beta_)
-      : Arowptrs(Arowptrs_),
-        Browptrs(Browptrs_),
-        Crowptrs(Crowptrs_),
-        Acolinds(Acolinds_),
-        Bcolinds(Bcolinds_),
-        Ccolinds(Ccolinds_),
-        Avalues(Avalues_),
-        Bvalues(Bvalues_),
-        Cvalues(Cvalues_),
-        alpha(alpha_),
-        beta(beta_) {}
-
-  KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const {
-    const ordinal_type ORDINAL_MAX = Kokkos::ArithTraits<ordinal_type>::max();
-
-    // count the union of nonzeros in Arow and Brow
-    size_type ai        = 0;
-    size_type bi        = 0;
-    size_type Arowstart = Arowptrs(i);
-    size_type Arowlen   = Arowptrs(i + 1) - Arowstart;
-    size_type Browstart = Browptrs(i);
-    size_type Browlen   = Browptrs(i + 1) - Browstart;
-    ordinal_type Acol   = (Arowlen == 0) ? ORDINAL_MAX : Acolinds(Arowstart);
-    ordinal_type Bcol   = (Browlen == 0) ? ORDINAL_MAX : Bcolinds(Browstart);
-    size_type Coffset   = Crowptrs(i);
-    while (Acol != ORDINAL_MAX || Bcol != ORDINAL_MAX) {
-      ordinal_type Ccol = (Acol < Bcol) ? Acol : Bcol;
-      // Eat all entries in both A and B which have this column
-      // This also results in Acol/Bcol being updated to following entries for
-      // next loop iter
-      CscalarT accum = Kokkos::ArithTraits<CscalarT>::zero();
-      while (Acol == Ccol) {
-        accum += static_cast<CscalarT>(alpha * Avalues(Arowstart + ai));
-        ai++;
-        if (ai == Arowlen)
-          Acol = ORDINAL_MAX;
-        else
-          Acol = Acolinds(Arowstart + ai);
-      }
-      while (Bcol == Ccol) {
-        accum += static_cast<CscalarT>(beta * Bvalues(Browstart + bi));
-        bi++;
-        if (bi == Browlen)
-          Bcol = ORDINAL_MAX;
-        else
-          Bcol = Bcolinds(Browstart + bi);
-      }
-      Ccolinds(Coffset) = Ccol;
-      Cvalues(Coffset)  = accum;
-      Coffset++;
-    }
-  }
-
-  const ArowptrsT Arowptrs;
-  const BrowptrsT Browptrs;
-  const CrowptrsT Crowptrs;
-  const AcolindsT Acolinds;
-  const BcolindsT Bcolinds;
-  CcolindsT Ccolinds;
-  const AvaluesT Avalues;
-  const BvaluesT Bvalues;
-  CvaluesT Cvalues;
-  const AscalarT alpha;
-  const BscalarT beta;
-};
-
-template <typename size_type, typename ordinal_type, typename ArowptrsT,
-          typename BrowptrsT, typename CrowptrsT, typename AcolindsT,
-          typename BcolindsT, typename CcolindsT, typename AvaluesT,
-          typename BvaluesT, typename CvaluesT, typename AscalarT,
-          typename BscalarT>
-struct UnsortedNumericSumFunctor {
-  using CscalarT = typename CvaluesT::non_const_value_type;
-
-  UnsortedNumericSumFunctor(
-      const ArowptrsT Arowptrs_, const BrowptrsT Browptrs_,
-      const CrowptrsT Crowptrs_, const AcolindsT Acolinds_,
-      const BcolindsT Bcolinds_, CcolindsT Ccolinds_, const AvaluesT Avalues_,
-      const BvaluesT Bvalues_, CvaluesT Cvalues_, const AscalarT alpha_,
-      const BscalarT beta_, const CcolindsT Apos_, const CcolindsT Bpos_)
-      : Arowptrs(Arowptrs_),
-        Browptrs(Browptrs_),
-        Crowptrs(Crowptrs_),
-        Acolinds(Acolinds_),
-        Bcolinds(Bcolinds_),
-        Ccolinds(Ccolinds_),
-        Avalues(Avalues_),
-        Bvalues(Bvalues_),
-        Cvalues(Cvalues_),
-        alpha(alpha_),
-        beta(beta_),
-        Apos(Apos_),
-        Bpos(Bpos_) {}
-
-  KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const {
-    size_type CrowStart = Crowptrs(i);
-    size_type CrowEnd   = Crowptrs(i + 1);
-    size_type ArowStart = Arowptrs(i);
-    size_type ArowEnd   = Arowptrs(i + 1);
-    size_type BrowStart = Browptrs(i);
-    size_type BrowEnd   = Browptrs(i + 1);
-    for (size_type j = CrowStart; j < CrowEnd; j++)
-      Cvalues(j) = Kokkos::ArithTraits<CscalarT>::zero();
-    // add in A entries, while setting C colinds
-    for (size_type j = ArowStart; j < ArowEnd; j++) {
-      Cvalues(CrowStart + Apos(j)) += alpha * Avalues(j);
-      Ccolinds(CrowStart + Apos(j)) = Acolinds(j);
-    }
-    // add in B entries, while setting C colinds
-    for (size_type j = BrowStart; j < BrowEnd; j++) {
-      Cvalues(CrowStart + Bpos(j)) += beta * Bvalues(j);
-      Ccolinds(CrowStart + Bpos(j)) = Bcolinds(j);
-    }
-  }
-  const ArowptrsT Arowptrs;
-  const BrowptrsT Browptrs;
-  const CrowptrsT Crowptrs;
-  const AcolindsT Acolinds;
-  const BcolindsT Bcolinds;
-  CcolindsT Ccolinds;
-  const AvaluesT Avalues;
-  const BvaluesT Bvalues;
-  CvaluesT Cvalues;
-  const AscalarT alpha;
-  const BscalarT beta;
-  const CcolindsT Apos;
-  const CcolindsT Bpos;
-};
-
 template <typename KernelHandle, typename alno_row_view_t_,
           typename alno_nnz_view_t_, typename ascalar_t_,
           typename ascalar_nnz_view_t_, typename blno_row_view_t_,
           typename blno_nnz_view_t_, typename bscalar_t_,
           typename bscalar_nnz_view_t_, typename clno_row_view_t_,
           typename clno_nnz_view_t_, typename cscalar_nnz_view_t_>
-void spadd_numeric(KernelHandle* kernel_handle, const alno_row_view_t_ a_rowmap,
+void spadd_numeric(KernelHandle* handle, const alno_row_view_t_ a_rowmap,
                    const alno_nnz_view_t_ a_entries,
                    const ascalar_nnz_view_t_ a_values, const ascalar_t_ alpha,
                    const blno_row_view_t_ b_rowmap,
@@ -802,89 +130,81 @@ void spadd_numeric(KernelHandle* kernel_handle, const alno_row_view_t_ a_rowmap,
                    const bscalar_nnz_view_t_ b_values, const bscalar_t_ beta,
                    const clno_row_view_t_ c_rowmap, clno_nnz_view_t_ c_entries,
                    cscalar_nnz_view_t_ c_values) {
-  typedef typename KernelHandle::size_type size_type;
-  typedef typename KernelHandle::nnz_lno_t ordinal_type;
-  typedef typename KernelHandle::nnz_scalar_t scalar_type;
-  typedef
-      typename KernelHandle::SPADDHandleType::execution_space execution_space;
-  // Check that A/B/C data types match KernelHandle types, and that C data types
-  // are nonconst (doesn't matter if A/B types are const)
-  static_assert(SAME_TYPE(ascalar_t_, scalar_type),
-                "A scalar type must match handle scalar type");
-  static_assert(SAME_TYPE(bscalar_t_, scalar_type),
-                "B scalar type must match handle scalar type");
-  static_assert(SAME_TYPE(typename alno_row_view_t_::value_type, size_type),
-                "add_symbolic: A size_type must match KernelHandle size_type "
-                "(const doesn't matter)");
-  static_assert(SAME_TYPE(typename blno_row_view_t_::value_type, size_type),
-                "add_symbolic: B size_type must match KernelHandle size_type "
-                "(const doesn't matter)");
-  static_assert(
-      SAME_TYPE(typename clno_row_view_t_::non_const_value_type, size_type),
-      "add_symbolic: C size_type must match KernelHandle size_type)");
-  static_assert(SAME_TYPE(typename alno_nnz_view_t_::value_type, ordinal_type),
-                "add_symbolic: A entry type must match KernelHandle entry type "
-                "(aka nnz_lno_t, and const doesn't matter)");
-  static_assert(SAME_TYPE(typename blno_nnz_view_t_::value_type, ordinal_type),
-                "add_symbolic: B entry type must match KernelHandle entry type "
-                "(aka nnz_lno_t, and const doesn't matter)");
-  static_assert(SAME_TYPE(typename clno_nnz_view_t_::value_type, ordinal_type),
-                "add_symbolic: C entry type must match KernelHandle entry type "
-                "(aka nnz_lno_t)");
-  static_assert(std::is_same<typename clno_nnz_view_t_::non_const_value_type,
-                             typename clno_nnz_view_t_::value_type>::value,
-                "add_symbolic: C entry type must not be const");
-  static_assert(
-      SAME_TYPE(typename ascalar_nnz_view_t_::value_type, scalar_type),
-      "add_symbolic: A scalar type must match KernelHandle entry type (aka "
-      "nnz_lno_t, and const doesn't matter)");
-  static_assert(
-      SAME_TYPE(typename bscalar_nnz_view_t_::value_type, scalar_type),
-      "add_symbolic: B scalar type must match KernelHandle entry type (aka "
-      "nnz_lno_t, and const doesn't matter)");
-  static_assert(
-      SAME_TYPE(typename cscalar_nnz_view_t_::value_type, scalar_type),
-      "add_symbolic: C scalar type must match KernelHandle entry type (aka "
-      "nnz_lno_t)");
-  static_assert(std::is_same<typename cscalar_nnz_view_t_::non_const_value_type,
-                             typename cscalar_nnz_view_t_::value_type>::value,
-                "add_symbolic: C scalar type must not be const");
-  typedef Kokkos::RangePolicy<execution_space, size_type> range_type;
-  auto addHandle = kernel_handle->get_spadd_handle();
-  // rowmap length can be 0 or 1 if #rows is 0.
-  // Otherwise, it's always #rows+1.
-  if (a_rowmap.extent(0) == 0 || a_rowmap.extent(0) == 1) {
-    addHandle->set_call_numeric();
-    return;
-  }
-  ordinal_type nrows = a_rowmap.extent(0) - 1;
-  if (addHandle->is_input_sorted()) {
-    SortedNumericSumFunctor<
-        size_type, ordinal_type, alno_row_view_t_, blno_row_view_t_,
-        clno_row_view_t_, alno_nnz_view_t_, blno_nnz_view_t_, clno_nnz_view_t_,
-        ascalar_nnz_view_t_, bscalar_nnz_view_t_, cscalar_nnz_view_t_,
-        ascalar_t_, bscalar_t_>
-        sortedNumeric(a_rowmap, b_rowmap, c_rowmap, a_entries, b_entries,
-                      c_entries, a_values, b_values, c_values, alpha, beta);
-    Kokkos::parallel_for("KokkosSparse::SpAdd:Numeric::InputSorted",
-                         range_type(0, nrows), sortedNumeric);
-  } else {
-    // use a_pos and b_pos (set in the handle by symbolic) to quickly compute C
-    // entries and values
-    UnsortedNumericSumFunctor<
-        size_type, ordinal_type, alno_row_view_t_, blno_row_view_t_,
-        clno_row_view_t_, alno_nnz_view_t_, blno_nnz_view_t_, clno_nnz_view_t_,
-        ascalar_nnz_view_t_, bscalar_nnz_view_t_, cscalar_nnz_view_t_,
-        ascalar_t_, bscalar_t_>
-        unsortedNumeric(a_rowmap, b_rowmap, c_rowmap, a_entries, b_entries,
-                        c_entries, a_values, b_values, c_values, alpha, beta,
-                        addHandle->get_a_pos(), addHandle->get_b_pos());
-    Kokkos::parallel_for("KokkosSparse::SpAdd:Numeric::InputNotSorted",
-                         range_type(0, nrows), unsortedNumeric);
-  }
-  addHandle->set_call_numeric();
-  // this fence is for accurate timing from host
-  execution_space().fence();
+  typedef typename KernelHandle::HandleExecSpace ExecSpace;
+  typedef typename KernelHandle::HandleTempMemorySpace MemSpace;
+  typedef typename KernelHandle::HandlePersistentMemorySpace PersistentMemSpace;
+  typedef typename Kokkos::Device<ExecSpace, MemSpace> DeviceType;
+
+  typedef typename KernelHandle::const_size_type c_size_t;
+  typedef typename KernelHandle::const_nnz_lno_t c_lno_t;
+  typedef typename KernelHandle::const_nnz_scalar_t c_scalar_t;
+
+  typedef typename KokkosKernels::Experimental::KokkosKernelsHandle<
+      c_size_t, c_lno_t, c_scalar_t, ExecSpace, MemSpace, PersistentMemSpace>
+      ConstKernelHandle;
+  ConstKernelHandle tmp_handle(*handle);
+
+  typedef Kokkos::View<typename alno_row_view_t_::const_value_type*,
+                       typename KokkosKernels::Impl::GetUnifiedLayout<
+                           alno_row_view_t_>::array_layout,
+                       DeviceType, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
+      Internal_a_rowmap;
+  typedef Kokkos::View<typename alno_nnz_view_t_::const_value_type*,
+                       typename KokkosKernels::Impl::GetUnifiedLayout<
+                           alno_nnz_view_t_>::array_layout,
+                       DeviceType, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
+      Internal_a_entries;
+  typedef Kokkos::View<typename ascalar_nnz_view_t_::const_value_type*,
+                       typename KokkosKernels::Impl::GetUnifiedLayout<
+                           ascalar_nnz_view_t_>::array_layout,
+                       DeviceType, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
+      Internal_a_values;
+  typedef Kokkos::View<typename blno_row_view_t_::const_value_type*,
+                       typename KokkosKernels::Impl::GetUnifiedLayout<
+                           blno_row_view_t_>::array_layout,
+                       DeviceType, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
+      Internal_b_rowmap;
+  typedef Kokkos::View<typename blno_nnz_view_t_::const_value_type*,
+                       typename KokkosKernels::Impl::GetUnifiedLayout<
+                           blno_nnz_view_t_>::array_layout,
+                       DeviceType, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
+      Internal_b_entries;
+  typedef Kokkos::View<typename bscalar_nnz_view_t_::const_value_type*,
+                       typename KokkosKernels::Impl::GetUnifiedLayout<
+                           bscalar_nnz_view_t_>::array_layout,
+                       DeviceType, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
+      Internal_b_values;
+  typedef Kokkos::View<typename clno_row_view_t_::const_value_type*,
+                       typename KokkosKernels::Impl::GetUnifiedLayout<
+                           clno_row_view_t_>::array_layout,
+                       DeviceType, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
+      Internal_c_rowmap;
+  typedef Kokkos::View<typename clno_nnz_view_t_::value_type*,
+                       typename KokkosKernels::Impl::GetUnifiedLayout<
+                           clno_nnz_view_t_>::array_layout,
+                       DeviceType, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
+      Internal_c_entries;
+  typedef Kokkos::View<typename cscalar_nnz_view_t_::value_type*,
+                       typename KokkosKernels::Impl::GetUnifiedLayout<
+                           cscalar_nnz_view_t_>::array_layout,
+                       DeviceType, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
+      Internal_c_values;
+  KokkosSparse::Impl::SPADD_NUMERIC<ConstKernelHandle, Internal_a_rowmap,
+                                    Internal_a_entries, Internal_a_values,
+                                    Internal_b_rowmap, Internal_b_entries,
+                                    Internal_b_values, Internal_c_rowmap,
+                                    Internal_c_entries, Internal_c_values>::
+      spadd_numeric(&tmp_handle, alpha,
+                    Internal_a_rowmap(a_rowmap.data(), a_rowmap.extent(0)),
+                    Internal_a_entries(a_entries.data(), a_entries.extent(0)),
+                    Internal_a_values(a_values.data(), a_values.extent(0)),
+                    beta,
+                    Internal_b_rowmap(b_rowmap.data(), b_rowmap.extent(0)),
+                    Internal_b_entries(b_entries.data(), b_entries.extent(0)),
+                    Internal_b_values(b_values.data(), b_values.extent(0)),
+                    Internal_c_rowmap(c_rowmap.data(), c_rowmap.extent(0)),
+                    Internal_c_entries(c_entries.data(), c_entries.extent(0)),
+                    Internal_c_values(c_values.data(), c_values.extent(0)));
 }
 }  // namespace Experimental
 
diff --git a/src/sparse/KokkosSparse_spgemm.hpp b/src/sparse/KokkosSparse_spgemm.hpp
index bdf4d0da75..0cee2979a2 100644
--- a/src/sparse/KokkosSparse_spgemm.hpp
+++ b/src/sparse/KokkosSparse_spgemm.hpp
@@ -81,6 +81,47 @@ void spgemm_symbolic(KernelHandle& kh, const AMatrix& A, const bool Amode,
               entriesC);
 }
 
+// Symbolic phase for block SpGEMM (BSR matrices)
+template <class KernelHandle, class AMatrixType, class BMatrixType,
+          class CMatrixType>
+void block_spgemm_symbolic(KernelHandle& kh, const AMatrixType& A,
+                           const bool transposeA, const BMatrixType& B,
+                           const bool transposeB, CMatrixType& C) {
+  using row_map_type = typename CMatrixType::row_map_type::non_const_type;
+  using entries_type = typename CMatrixType::index_type::non_const_type;
+  using values_type  = typename CMatrixType::values_type::non_const_type;
+
+  auto blockDim = A.blockDim();
+  if (blockDim != B.blockDim()) {
+    throw std::invalid_argument(
+        "Block SpGEMM must be called for matrices with the same block size");
+  }
+
+  row_map_type row_mapC(
+      Kokkos::view_alloc(Kokkos::WithoutInitializing, "non_const_lnow_row"),
+      A.numRows() + 1);
+
+  KokkosSparse::Experimental::spgemm_symbolic(
+      &kh, A.numRows(), B.numRows(), B.numCols(), A.graph.row_map,
+      A.graph.entries, transposeA, B.graph.row_map, B.graph.entries, transposeB,
+      row_mapC);
+
+  entries_type entriesC;
+  values_type valuesC;
+  const size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
+  if (c_nnz_size) {
+    entriesC = entries_type(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing, "entriesC"),
+        c_nnz_size);
+    valuesC =
+        values_type(Kokkos::view_alloc(Kokkos::WithoutInitializing, "valuesC"),
+                    c_nnz_size * blockDim * blockDim);
+  }
+
+  C = CMatrixType("C=AB", A.numRows(), B.numCols(), c_nnz_size, valuesC,
+                  row_mapC, entriesC, blockDim);
+}
+
 template <class KernelHandle, class AMatrix, class BMatrix, class CMatrix>
 void spgemm_numeric(KernelHandle& kh, const AMatrix& A, const bool Amode,
                     const BMatrix& B, const bool Bmode, CMatrix& C) {
@@ -94,6 +135,21 @@ void spgemm_numeric(KernelHandle& kh, const AMatrix& A, const bool Amode,
       B.values, Bmode, C.graph.row_map, C.graph.entries, C.values);
 }
 
+template <class KernelHandle, class AMatrix, class BMatrix, class CMatrix>
+void block_spgemm_numeric(KernelHandle& kh, const AMatrix& A, const bool Amode,
+                          const BMatrix& B, const bool Bmode, CMatrix& C) {
+  auto blockDim = A.blockDim();
+  if (blockDim != B.blockDim() or blockDim != C.blockDim()) {
+    throw std::invalid_argument(
+        "Block SpGEMM must be called for matrices with the same block size");
+  }
+
+  KokkosSparse::Experimental::spgemm_numeric(
+      &kh, A.numRows(), B.numRows(), B.numCols(), A.graph.row_map,
+      A.graph.entries, A.values, Amode, B.graph.row_map, B.graph.entries,
+      B.values, Bmode, C.graph.row_map, C.graph.entries, C.values, blockDim);
+}
+
 }  // namespace KokkosSparse
 
 #endif
diff --git a/src/sparse/KokkosSparse_spgemm_numeric.hpp b/src/sparse/KokkosSparse_spgemm_numeric.hpp
index 60a54f5b8b..313922dc62 100644
--- a/src/sparse/KokkosSparse_spgemm_numeric.hpp
+++ b/src/sparse/KokkosSparse_spgemm_numeric.hpp
@@ -46,11 +46,18 @@
 
 #include "KokkosKernels_helpers.hpp"
 #include "KokkosSparse_spgemm_numeric_spec.hpp"
+#include "KokkosSparse_bspgemm_numeric_spec.hpp"
 
 namespace KokkosSparse {
 
 namespace Experimental {
 
+//
+// NOTE: block_dim = 1 for CRS-formated views
+//       block_dim >= 1 for BSR-formatted views (bs=1 BSR is CRS)
+//
+// NOTE: Block CRS format is not yet supported !
+//
 template <typename KernelHandle, typename alno_row_view_t_,
           typename alno_nnz_view_t_, typename ascalar_nnz_view_t_,
           typename blno_row_view_t_, typename blno_nnz_view_t_,
@@ -66,7 +73,9 @@ void spgemm_numeric(KernelHandle *handle,
                     bool transposeA, blno_row_view_t_ row_mapB,
                     blno_nnz_view_t_ entriesB, bscalar_nnz_view_t_ valuesB,
                     bool transposeB, clno_row_view_t_ row_mapC,
-                    clno_nnz_view_t_ &entriesC, cscalar_nnz_view_t_ &valuesC) {
+                    clno_nnz_view_t_ &entriesC, cscalar_nnz_view_t_ &valuesC,
+
+                    typename KernelHandle::const_nnz_lno_t block_dim = 1) {
   static_assert(
       std::is_same<typename clno_nnz_view_t_::value_type,
                    typename clno_nnz_view_t_::non_const_value_type>::value,
@@ -139,7 +148,9 @@ void spgemm_numeric(KernelHandle *handle,
         "If you need this case please let kokkos-kernels developers know.\n");
   }
 
-  if (m < 1 || n < 1 || k < 1) return;
+  if (m < 1 || n < 1 || k < 1 || entriesA.extent(0) < 1 ||
+      entriesB.extent(0) < 1)
+    return;
 
   typedef typename KernelHandle::const_size_type c_size_t;
   typedef typename KernelHandle::const_nnz_lno_t c_lno_t;
@@ -240,6 +251,23 @@ void spgemm_numeric(KernelHandle *handle,
   Internal_clno_nnz_view_t_ nonconst_c_l(entriesC.data(), entriesC.extent(0));
   Internal_cscalar_nnz_view_t_ nonconst_c_s(valuesC.data(), valuesC.extent(0));
 
+  if (block_dim > 1) {
+    KokkosSparse::Impl::BSPGEMM_NUMERIC<
+        const_handle_type, Internal_alno_row_view_t_, Internal_alno_nnz_view_t_,
+        Internal_ascalar_nnz_view_t_, Internal_blno_row_view_t_,
+        Internal_blno_nnz_view_t_, Internal_bscalar_nnz_view_t_,
+        Internal_clno_row_view_t_, Internal_clno_nnz_view_t_,
+        Internal_cscalar_nnz_view_t_>::bspgemm_numeric(&tmp_handle, m, n, k,
+                                                       block_dim, const_a_r,
+                                                       const_a_l, const_a_s,
+                                                       transposeA, const_b_r,
+                                                       const_b_l, const_b_s,
+                                                       transposeB, nonconst_c_r,
+                                                       nonconst_c_l,
+                                                       nonconst_c_s);
+    return;
+  }
+
   KokkosSparse::Impl::SPGEMM_NUMERIC<
       const_handle_type,  // KernelHandle,
       Internal_alno_row_view_t_, Internal_alno_nnz_view_t_,
diff --git a/src/sparse/KokkosSparse_spiluk_handle.hpp b/src/sparse/KokkosSparse_spiluk_handle.hpp
index 3cabcd0f73..54cc124474 100644
--- a/src/sparse/KokkosSparse_spiluk_handle.hpp
+++ b/src/sparse/KokkosSparse_spiluk_handle.hpp
@@ -45,6 +45,7 @@
 #include <Kokkos_Core.hpp>
 #include <iostream>
 #include <string>
+#include <KokkosKernels_HashmapAccumulator.hpp>
 
 #ifndef _SPILUKHANDLE_HPP
 #define _SPILUKHANDLE_HPP
@@ -87,6 +88,12 @@ class SPILUKHandle {
   typedef typename Kokkos::View<nnz_lno_t *, HandlePersistentMemorySpace>
       nnz_lno_view_t;
 
+  typedef typename Kokkos::View<size_type *, Kokkos::HostSpace>
+      nnz_row_view_host_t;
+
+  typedef typename Kokkos::View<nnz_lno_t *, Kokkos::HostSpace>
+      nnz_lno_view_host_t;
+
   typedef typename std::make_signed<
       typename nnz_row_view_t::non_const_value_type>::type signed_integral_t;
   typedef Kokkos::View<signed_integral_t *,
@@ -95,14 +102,19 @@ class SPILUKHandle {
                        typename nnz_row_view_t::memory_traits>
       signed_nnz_lno_view_t;
 
+  typedef Kokkos::View<nnz_lno_t **, Kokkos::LayoutRight,
+                       HandlePersistentMemorySpace>
+      work_view_t;
+
  private:
   nnz_row_view_t level_list;  // level IDs which the rows belong to
   nnz_lno_view_t level_idx;   // the list of rows in each level
   nnz_lno_view_t
       level_ptr;  // the starting index (into the view level_idx) of each level
-  nnz_lno_view_t level_nchunks;  // number of chunks of rows at each level
-  nnz_lno_view_t
+  nnz_lno_view_host_t level_nchunks;  // number of chunks of rows at each level
+  nnz_lno_view_host_t
       level_nrowsperchunk;  // maximum number of rows among chunks at each level
+  work_view_t iw;  // working view for mapping dense indices to sparse indices
 
   size_type nrows;
   size_type nlevels;
@@ -128,6 +140,7 @@ class SPILUKHandle {
         level_ptr(),
         level_nchunks(),
         level_nrowsperchunk(),
+        iw(),
         nrows(nrows_),
         nlevels(0),
         nnzL(nnzL_),
@@ -147,11 +160,12 @@ class SPILUKHandle {
     set_nnzU(nnzU_);
     set_level_maxrows(0);
     set_level_maxrowsperchunk(0);
-    level_list    = nnz_row_view_t("level_list", nrows_),
-    level_idx     = nnz_lno_view_t("level_idx", nrows_),
-    level_ptr     = nnz_lno_view_t("level_ptr", nrows_ + 1),
-    level_nchunks = nnz_lno_view_t(), level_nrowsperchunk = nnz_lno_view_t(),
-    reset_symbolic_complete();
+    level_list          = nnz_row_view_t("level_list", nrows_),
+    level_idx           = nnz_lno_view_t("level_idx", nrows_),
+    level_ptr           = nnz_lno_view_t("level_ptr", nrows_ + 1),
+    level_nchunks       = nnz_lno_view_host_t(),
+    level_nrowsperchunk = nnz_lno_view_host_t(), reset_symbolic_complete(),
+    iw                  = work_view_t();
   }
 
   virtual ~SPILUKHandle(){};
@@ -170,17 +184,28 @@ class SPILUKHandle {
   nnz_lno_view_t get_level_ptr() const { return level_ptr; }
 
   KOKKOS_INLINE_FUNCTION
-  nnz_lno_view_t get_level_nchunks() const { return level_nchunks; }
+  nnz_lno_view_host_t get_level_nchunks() const { return level_nchunks; }
 
   void alloc_level_nchunks(const size_type nlevels_) {
-    level_nchunks = nnz_lno_view_t("level_nchunks", nlevels_);
+    level_nchunks = nnz_lno_view_host_t("level_nchunks", nlevels_);
   }
 
   KOKKOS_INLINE_FUNCTION
-  nnz_lno_view_t get_level_nrowsperchunk() const { return level_nrowsperchunk; }
+  nnz_lno_view_host_t get_level_nrowsperchunk() const {
+    return level_nrowsperchunk;
+  }
 
   void alloc_level_nrowsperchunk(const size_type nlevels_) {
-    level_nrowsperchunk = nnz_lno_view_t("level_nrowsperchunk", nlevels_);
+    level_nrowsperchunk = nnz_lno_view_host_t("level_nrowsperchunk", nlevels_);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  work_view_t get_iw() const { return iw; }
+
+  void alloc_iw(const size_type nrows_, const size_type ncols_) {
+    iw = work_view_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "iw"),
+                     nrows_, ncols_);
+    Kokkos::deep_copy(iw, nnz_lno_t(-1));
   }
 
   KOKKOS_INLINE_FUNCTION
@@ -238,8 +263,7 @@ class SPILUKHandle {
     if (algm == SPILUKAlgorithm::SEQLVLSCHD_TP1)
       std::cout << "SEQLVLSCHD_TP1" << std::endl;
 
-    /*
-    if ( algm == SPILUKAlgorithm::SEQLVLSCHED_TP2 ) {
+    /*if ( algm == SPILUKAlgorithm::SEQLVLSCHED_TP2 ) {
       std::cout << "SEQLVLSCHED_TP2" << std::endl;;
       std::cout << "WARNING: With CUDA this is currently only reliable with
     int-int ordinal-offset pair" << std::endl;
diff --git a/src/sparse/KokkosSparse_spmv.hpp b/src/sparse/KokkosSparse_spmv.hpp
index 8ec7799e16..95860029f1 100644
--- a/src/sparse/KokkosSparse_spmv.hpp
+++ b/src/sparse/KokkosSparse_spmv.hpp
@@ -662,9 +662,10 @@ template <class AlphaType, class AMatrix, class XVector, class BetaType,
           typename std::enable_if<
               KokkosSparse::is_crs_matrix<AMatrix>::value>::type* = nullptr>
 #endif
-void spmv(KokkosKernels::Experimental::Controls /*controls*/, const char mode[],
+void spmv(KokkosKernels::Experimental::Controls controls, const char mode[],
           const AlphaType& alpha, const AMatrix& A, const XVector& x,
           const BetaType& beta, const YVector& y, const RANK_TWO) {
+
   // Make sure that x and y have the same rank.
   static_assert(
       static_cast<int>(XVector::rank) == static_cast<int>(YVector::rank),
@@ -752,21 +753,50 @@ void spmv(KokkosKernels::Experimental::Controls /*controls*/, const char mode[],
     XVector_Internal x_i = x;
     YVector_Internal y_i = y;
 
-    return Impl::SPMV_MV<
-        typename AMatrix_Internal::value_type,
-        typename AMatrix_Internal::ordinal_type,
-        typename AMatrix_Internal::device_type,
-        typename AMatrix_Internal::memory_traits,
-        typename AMatrix_Internal::size_type,
-        typename XVector_Internal::value_type**,
-        typename XVector_Internal::array_layout,
-        typename XVector_Internal::device_type,
-        typename XVector_Internal::memory_traits,
-        typename YVector_Internal::value_type**,
-        typename YVector_Internal::array_layout,
-        typename YVector_Internal::device_type,
-        typename YVector_Internal::memory_traits>::spmv_mv(mode, alpha, A_i,
-                                                           x_i, beta, y_i);
+    bool useNative = false;
+
+// cusparseSpMM does not support conjugate mode
+#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
+    useNative = useNative || (Conjugate[0] == mode[0]);
+#endif
+    useNative = useNative || (controls.isParameter("algorithm") &&
+                              (controls.getParameter("algorithm") == "native"));
+
+    if (useNative) {
+      return Impl::SPMV_MV<
+          typename AMatrix_Internal::value_type,
+          typename AMatrix_Internal::ordinal_type,
+          typename AMatrix_Internal::device_type,
+          typename AMatrix_Internal::memory_traits,
+          typename AMatrix_Internal::size_type,
+          typename XVector_Internal::value_type**,
+          typename XVector_Internal::array_layout,
+          typename XVector_Internal::device_type,
+          typename XVector_Internal::memory_traits,
+          typename YVector_Internal::value_type**,
+          typename YVector_Internal::array_layout,
+          typename YVector_Internal::device_type,
+          typename YVector_Internal::memory_traits,
+          std::is_integral<typename AMatrix_Internal::value_type>::value,
+          false>::spmv_mv(controls, mode, alpha, A_i, x_i, beta, y_i);
+    } else {
+      return Impl::SPMV_MV<
+          typename AMatrix_Internal::value_type,
+          typename AMatrix_Internal::ordinal_type,
+          typename AMatrix_Internal::device_type,
+          typename AMatrix_Internal::memory_traits,
+          typename AMatrix_Internal::size_type,
+          typename XVector_Internal::value_type**,
+          typename XVector_Internal::array_layout,
+          typename XVector_Internal::device_type,
+          typename XVector_Internal::memory_traits,
+          typename YVector_Internal::value_type**,
+          typename YVector_Internal::array_layout,
+          typename YVector_Internal::device_type,
+          typename YVector_Internal::memory_traits>::spmv_mv(controls, mode,
+                                                             alpha, A_i, x_i,
+                                                             beta, y_i);
+    }
   }
 }
 
@@ -894,8 +924,10 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[],
   //
   // Whether to call KokkosKernel's native implementation, even if a TPL impl is
   // available
-  bool useFallback = controls.isParameter("algorithm") &&
-                     controls.getParameter("algorithm") == "native";
+  bool useFallback =
+      controls.isParameter("algorithm") &&
+      (controls.getParameter("algorithm") == "native" ||
+       controls.getParameter("algorithm") == "experimental_bsr_tc");
 
 #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
   // cuSPARSE does not support the modes (C), (T), (H)
@@ -936,6 +968,7 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[],
         typename YVector_Internal::array_layout,
         typename YVector_Internal::device_type,
         typename YVector_Internal::memory_traits,
+        std::is_integral<typename AMatrix_Internal::const_value_type>::value,
         false>::spmv_mv_bsrmatrix(controls, mode, alpha, A_i, x_i, beta, y_i);
     Kokkos::Profiling::popRegion();
   } else {
@@ -952,11 +985,9 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[],
         typename YVector_Internal::value_type**,
         typename YVector_Internal::array_layout,
         typename YVector_Internal::device_type,
-        typename YVector_Internal::memory_traits>::spmv_mv_bsrmatrix(controls,
-                                                                     mode,
-                                                                     alpha, A_i,
-                                                                     x_i, beta,
-                                                                     y_i);
+        typename YVector_Internal::memory_traits,
+        std::is_integral<typename AMatrix_Internal::const_value_type>::value>::
+        spmv_mv_bsrmatrix(controls, mode, alpha, A_i, x_i, beta, y_i);
   }
 }
 
@@ -1072,12 +1103,12 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[],
   }
   //
   return Experimental::Impl::SPMV_MV_BLOCKCRSMATRIX<
-      typename AMatrix_Internal::value_type,
-      typename AMatrix_Internal::ordinal_type,
+      typename AMatrix_Internal::const_value_type,
+      typename AMatrix_Internal::const_ordinal_type,
       typename AMatrix_Internal::device_type,
       typename AMatrix_Internal::memory_traits,
-      typename AMatrix_Internal::size_type,
-      typename XVector_Internal::value_type**,
+      typename AMatrix_Internal::const_size_type,
+      typename XVector_Internal::const_value_type**,
       typename XVector_Internal::array_layout,
       typename XVector_Internal::device_type,
       typename XVector_Internal::memory_traits,
@@ -1097,7 +1128,7 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[],
 /// entries of y; if alpha == 0, ignore the entries of A and x.
 ///
 /// If \c AMatrix is a KokkosSparse::Experimental::BsrMatrix, controls may have
-/// \c "algorithm" = \c "experimental_tc_bsr" to use Nvidia tensor cores on
+/// \c "algorithm" = \c "experimental_bsr_tc" to use Nvidia tensor cores on
 /// Volta or Ampere architectures. On Volta-architecture GPUs the only available
 /// precision is mixed-precision fp32 accumulator from fp16 inputs. On
 /// Ampere-architecture GPUs (cc >= 80), mixed precision is used when A is fp16,
@@ -1530,8 +1561,9 @@ void spmv_struct(const char mode[], const int stencil_type,
         typename YVector_Internal::value_type**,
         typename YVector_Internal::array_layout,
         typename YVector_Internal::device_type,
-        typename YVector_Internal::memory_traits>::spmv_mv(mode, alpha, A_i,
-                                                           x_i, beta, y_i);
+        typename YVector_Internal::memory_traits>::
+        spmv_mv(KokkosKernels::Experimental::Controls(), mode, alpha, A_i, x_i,
+                beta, y_i);
   }
 }
 
diff --git a/src/sparse/KokkosSparse_sptrsv_cholmod.hpp b/src/sparse/KokkosSparse_sptrsv_cholmod.hpp
index 796ee579bd..6d354047cf 100644
--- a/src/sparse/KokkosSparse_sptrsv_cholmod.hpp
+++ b/src/sparse/KokkosSparse_sptrsv_cholmod.hpp
@@ -56,7 +56,7 @@
     defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV)
 
 #include "cholmod.h"
-#include "KokkosKernels_SparseUtils.hpp"
+#include "KokkosSparse_Utils.hpp"
 #include "KokkosSparse_sptrsv_supernode.hpp"
 
 namespace KokkosSparse {
diff --git a/src/sparse/KokkosSparse_sptrsv_supernode.hpp b/src/sparse/KokkosSparse_sptrsv_supernode.hpp
index fa9a607be7..481bd2cc0a 100644
--- a/src/sparse/KokkosSparse_sptrsv_supernode.hpp
+++ b/src/sparse/KokkosSparse_sptrsv_supernode.hpp
@@ -63,7 +63,7 @@
 #include "KokkosBatched_Trmm_Decl.hpp"
 #include "KokkosBatched_Trmm_Serial_Impl.hpp"
 
-#include "KokkosKernels_Sorting.hpp"
+#include "KokkosSparse_SortCrs.hpp"
 #include "KokkosSparse_sptrsv.hpp"
 
 namespace KokkosSparse {
@@ -597,8 +597,8 @@ host_graph_t generate_supernodal_graph(bool col_major, graph_t &graph,
 #endif
 
   // sort column ids per row
-  KokkosKernels::sort_crs_graph<Kokkos::HostSpace::execution_space,
-                                row_map_view_host_t, cols_view_host_t>(hr, hc);
+  KokkosSparse::sort_crs_graph<Kokkos::HostSpace::execution_space,
+                               row_map_view_host_t, cols_view_host_t>(hr, hc);
 #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE
   time_seconds = timer.seconds();
   std::cout << "   > Generate Supernodal Graph: sort graph     : "
diff --git a/src/sparse/impl/KokkosSparse_bspgemm_impl.hpp b/src/sparse/impl/KokkosSparse_bspgemm_impl.hpp
new file mode 100644
index 0000000000..7b003229ab
--- /dev/null
+++ b/src/sparse/impl/KokkosSparse_bspgemm_impl.hpp
@@ -0,0 +1,198 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef _KOKKOSBSPGEMMIMPL_HPP
+#define _KOKKOSBSPGEMMIMPL_HPP
+
+#include "KokkosSparse_spgemm_impl.hpp"
+
+namespace KokkosSparse {
+
+namespace Impl {
+
+template <typename HandleType, typename a_row_view_t_,
+          typename a_lno_nnz_view_t_, typename a_scalar_nnz_view_t_,
+          typename b_lno_row_view_t_, typename b_lno_nnz_view_t_,
+          typename b_scalar_nnz_view_t_>
+class KokkosBSPGEMM
+    : public KokkosSPGEMM<HandleType, a_row_view_t_, a_lno_nnz_view_t_,
+                          a_scalar_nnz_view_t_, b_lno_row_view_t_,
+                          b_lno_nnz_view_t_, b_scalar_nnz_view_t_> {
+ public:
+  using Base = KokkosSparse::Impl::KokkosSPGEMM<
+      HandleType, a_row_view_t_, a_lno_nnz_view_t_, a_scalar_nnz_view_t_,
+      b_lno_row_view_t_, b_lno_nnz_view_t_, b_scalar_nnz_view_t_>;
+
+#define USE_BASE_TYPE(type) using type = typename Base::type;
+
+  USE_BASE_TYPE(nnz_lno_t)
+  USE_BASE_TYPE(scalar_t)
+  USE_BASE_TYPE(size_type)
+  USE_BASE_TYPE(const_a_lno_row_view_t)
+  USE_BASE_TYPE(const_a_lno_nnz_view_t)
+  USE_BASE_TYPE(const_a_scalar_nnz_view_t)
+  USE_BASE_TYPE(const_b_lno_row_view_t)
+  USE_BASE_TYPE(const_b_lno_nnz_view_t)
+  USE_BASE_TYPE(const_b_scalar_nnz_view_t)
+  USE_BASE_TYPE(row_lno_persistent_work_view_t)
+  USE_BASE_TYPE(nnz_lno_temp_work_view_t)
+  USE_BASE_TYPE(team_member_t)
+
+  USE_BASE_TYPE(MyExecSpace)
+  USE_BASE_TYPE(MyTempMemorySpace)
+  USE_BASE_TYPE(MultiCoreTag)
+  USE_BASE_TYPE(MultiCoreTag4)
+  USE_BASE_TYPE(GPUTag)
+  USE_BASE_TYPE(GPUTag4)
+  USE_BASE_TYPE(GPUTag6)
+  USE_BASE_TYPE(gpu_team_policy_t)
+  USE_BASE_TYPE(gpu_team_policy4_t)
+  USE_BASE_TYPE(gpu_team_policy6_t)
+  USE_BASE_TYPE(dynamic_multicore_team_policy_t)
+  USE_BASE_TYPE(dynamic_multicore_team_policy4_t)
+  USE_BASE_TYPE(multicore_team_policy_t)
+  USE_BASE_TYPE(multicore_team_policy4_t)
+
+ public:
+  //////////////////////////////////////////////////////////////////////////
+  /////BELOW CODE IS TO for SPEED SPGEMM
+  ////DECL IS AT _speed.hpp
+  //////////////////////////////////////////////////////////////////////////
+  template <typename a_row_view_t, typename a_nnz_view_t,
+            typename a_scalar_view_t, typename b_row_view_t,
+            typename b_nnz_view_t, typename b_scalar_view_t,
+            typename c_row_view_t, typename c_nnz_view_t,
+            typename c_scalar_view_t, typename mpool_type>
+  struct NumericCMEM_CPU;
+
+  template <typename a_row_view_t__, typename a_nnz_view_t__,
+            typename a_scalar_view_t__, typename b_row_view_t__,
+            typename b_nnz_view_t__, typename b_scalar_view_t__,
+            typename c_row_view_t__, typename c_nnz_view_t__,
+            typename c_scalar_view_t__, typename c_nnz_tmp_view_t>
+  struct NumericCMEM;
+
+ private:
+  /**
+   * \brief Numeric phase with speed method
+   */
+  template <typename c_row_view_t, typename c_lno_nnz_view_t,
+            typename c_scalar_nnz_view_t>
+  void KokkosBSPGEMM_numeric_speed(
+      c_row_view_t rowmapC_, c_lno_nnz_view_t entriesC_,
+      c_scalar_nnz_view_t valuesC_,
+      KokkosKernels::Impl::ExecSpaceType my_exec_space);
+
+ private:
+  // How many extra bytes are needed to align a scalar_t after an array of
+  // nnz_lno_t, in the worst case? Incurred once per hashmap, which may be per
+  // team or per thread depending on algorithm
+  static constexpr size_t scalarAlignPad =
+      (alignof(scalar_t) > alignof(nnz_lno_t))
+          ? (alignof(scalar_t) - alignof(nnz_lno_t))
+          : 0;
+
+  static constexpr bool exec_gpu =
+      KokkosKernels::Impl::kk_is_gpu_exec_space<MyExecSpace>();
+
+ private:
+  nnz_lno_t block_dim;
+
+ public:
+  //////////////////////////////////////////////////////////////////////////
+  /////BELOW CODE IS TO for kkmem SPGEMM
+  ////DECL IS AT _kkmem.hpp
+  //////////////////////////////////////////////////////////////////////////
+  template <typename a_row_view_t, typename a_nnz_view_t,
+            typename a_scalar_view_t, typename b_row_view_t,
+            typename b_nnz_view_t, typename b_scalar_view_t,
+            typename c_row_view_t, typename c_nnz_view_t,
+            typename c_scalar_view_t, typename pool_memory_type>
+  struct PortableNumericCHASH;
+
+  template <typename c_row_view_t, typename c_lno_nnz_view_t,
+            typename c_scalar_nnz_view_t>
+  void KokkosBSPGEMM_numeric_hash(
+      c_row_view_t rowmapC_, c_lno_nnz_view_t entriesC_,
+      c_scalar_nnz_view_t valuesC_,
+      KokkosKernels::Impl::ExecSpaceType my_exec_space);
+
+ public:
+  //////////////////////////////////////////////////////////////////////////
+  /////BELOW CODE IS for public symbolic and numeric functions
+  ////DECL IS AT _def.hpp
+  //////////////////////////////////////////////////////////////////////////
+  template <typename c_row_view_t, typename c_lno_nnz_view_t,
+            typename c_scalar_nnz_view_t>
+  void KokkosBSPGEMM_numeric(c_row_view_t &rowmapC_,
+                             c_lno_nnz_view_t &entriesC_,
+                             c_scalar_nnz_view_t &valuesC_);
+
+  KokkosBSPGEMM(HandleType *handle_, nnz_lno_t m_, nnz_lno_t n_, nnz_lno_t k_,
+                nnz_lno_t block_dim_, const_a_lno_row_view_t row_mapA_,
+                const_a_lno_nnz_view_t entriesA_, bool transposeA_,
+                const_b_lno_row_view_t row_mapB_,
+                const_b_lno_nnz_view_t entriesB_, bool transposeB_)
+      : Base(handle_, m_, n_, k_, row_mapA_, entriesA_, transposeA_, row_mapB_,
+             entriesB_, transposeB_),
+        block_dim(block_dim_) {}
+
+  KokkosBSPGEMM(HandleType *handle_, nnz_lno_t m_, nnz_lno_t n_, nnz_lno_t k_,
+                nnz_lno_t block_dim_, const_a_lno_row_view_t row_mapA_,
+                const_a_lno_nnz_view_t entriesA_,
+                const_a_scalar_nnz_view_t valsA_, bool transposeA_,
+                const_b_lno_row_view_t row_mapB_,
+                const_b_lno_nnz_view_t entriesB_,
+                const_b_scalar_nnz_view_t valsB_, bool transposeB_)
+      : Base(handle_, m_, n_, k_, row_mapA_, entriesA_, valsA_, transposeA_,
+             row_mapB_, entriesB_, valsB_, transposeB_),
+        block_dim(block_dim_) {}
+};
+
+}  // namespace Impl
+}  // namespace KokkosSparse
+#include "KokkosSparse_bspgemm_impl_kkmem.hpp"
+#include "KokkosSparse_bspgemm_impl_speed.hpp"
+#include "KokkosSparse_bspgemm_impl_def.hpp"
+#endif
diff --git a/src/sparse/impl/KokkosSparse_bspgemm_impl_def.hpp b/src/sparse/impl/KokkosSparse_bspgemm_impl_def.hpp
new file mode 100644
index 0000000000..36729f39ca
--- /dev/null
+++ b/src/sparse/impl/KokkosSparse_bspgemm_impl_def.hpp
@@ -0,0 +1,81 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+namespace KokkosSparse {
+
+namespace Impl {
+
+template <typename HandleType, typename a_row_view_t_,
+          typename a_lno_nnz_view_t_, typename a_scalar_nnz_view_t_,
+          typename b_lno_row_view_t_, typename b_lno_nnz_view_t_,
+          typename b_scalar_nnz_view_t_>
+template <typename c_row_view_t, typename c_lno_nnz_view_t,
+          typename c_scalar_nnz_view_t>
+void KokkosBSPGEMM<HandleType, a_row_view_t_, a_lno_nnz_view_t_,
+                   a_scalar_nnz_view_t_, b_lno_row_view_t_, b_lno_nnz_view_t_,
+                   b_scalar_nnz_view_t_>::
+    KokkosBSPGEMM_numeric(c_row_view_t &rowmapC_, c_lno_nnz_view_t &entriesC_,
+                          c_scalar_nnz_view_t &valuesC_) {
+  // get the algorithm and execution space.
+  // SPGEMMAlgorithm spgemm_algorithm =
+  // this->handle->get_spgemm_handle()->get_algorithm_type();
+  KokkosKernels::Impl::ExecSpaceType my_exec_space_ =
+      KokkosKernels::Impl::get_exec_space_type<MyExecSpace>();
+
+  if (Base::KOKKOSKERNELS_VERBOSE) {
+    std::cout << "Numeric PHASE" << std::endl;
+  }
+
+  if (Base::spgemm_algorithm == SPGEMM_KK_SPEED ||
+      Base::spgemm_algorithm == SPGEMM_KK_DENSE) {
+    this->KokkosBSPGEMM_numeric_speed(rowmapC_, entriesC_, valuesC_,
+                                      my_exec_space_);
+  } else {
+    this->KokkosBSPGEMM_numeric_hash(rowmapC_, entriesC_, valuesC_,
+                                     my_exec_space_);
+  }
+}
+
+}  // namespace Impl
+}  // namespace KokkosSparse
diff --git a/src/sparse/impl/KokkosSparse_bspgemm_impl_kkmem.hpp b/src/sparse/impl/KokkosSparse_bspgemm_impl_kkmem.hpp
new file mode 100644
index 0000000000..aae9d83b5f
--- /dev/null
+++ b/src/sparse/impl/KokkosSparse_bspgemm_impl_kkmem.hpp
@@ -0,0 +1,1658 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#define HASHSCALAR 107
+
+#include "KokkosKernels_Utils.hpp"
+#include "KokkosKernels_BlockHashmapAccumulator.hpp"
+
+namespace KokkosSparse {
+
+namespace Impl {
+
+template <typename HandleType, typename a_row_view_t_,
+          typename a_lno_nnz_view_t_, typename a_scalar_nnz_view_t_,
+          typename b_lno_row_view_t_, typename b_lno_nnz_view_t_,
+          typename b_scalar_nnz_view_t_>
+template <typename a_row_view_t, typename a_nnz_view_t,
+          typename a_scalar_view_t, typename b_row_view_t,
+          typename b_nnz_view_t, typename b_scalar_view_t,
+          typename c_row_view_t, typename c_nnz_view_t,
+          typename c_scalar_view_t, typename pool_memory_type>
+struct KokkosBSPGEMM<HandleType, a_row_view_t_, a_lno_nnz_view_t_,
+                     a_scalar_nnz_view_t_, b_lno_row_view_t_, b_lno_nnz_view_t_,
+                     b_scalar_nnz_view_t_>::PortableNumericCHASH {
+  using BlockAccumulator = KokkosKernels::Experimental::BlockHashmapAccumulator<
+      nnz_lno_t, nnz_lno_t, scalar_t,
+      KokkosKernels::Experimental::HashOpType::bitwiseAnd>;
+
+  static constexpr auto scalarAlignPad =
+      KokkosBSPGEMM<HandleType, a_row_view_t_, a_lno_nnz_view_t_,
+                    a_scalar_nnz_view_t_, b_lno_row_view_t_, b_lno_nnz_view_t_,
+                    b_scalar_nnz_view_t_>::scalarAlignPad;
+  nnz_lno_t numrows;
+  nnz_lno_t block_dim;
+  const nnz_lno_t block_size;
+  size_t block_bytes;
+
+  a_row_view_t row_mapA;
+  a_nnz_view_t entriesA;
+  a_scalar_view_t valuesA;
+
+  b_row_view_t row_mapB;
+  b_nnz_view_t entriesB;
+  b_scalar_view_t valuesB;
+
+  c_row_view_t rowmapC;
+  c_nnz_view_t entriesC;
+  c_scalar_view_t valuesC;
+
+  nnz_lno_t *pEntriesC;
+  scalar_t *pvaluesC;
+  const size_t shared_memory_size;
+  const int vector_size;
+  pool_memory_type memory_space;
+
+  // nnz_lno_t max_nnz;
+  const nnz_lno_t pow2_hash_size;
+  const nnz_lno_t max_nnz;
+  const nnz_lno_t pow2_hash_func;
+  const KokkosKernels::Impl::ExecSpaceType my_exec_space;
+
+  const int unit_memory;  // begins, nexts, and keys. No need for vals yet.
+  int team_size;
+  int thread_memory;
+  nnz_lno_t thread_shmem_key_size;
+  nnz_lno_t thread_shared_memory_hash_func;
+  nnz_lno_t thread_shmem_hash_size;
+
+  nnz_lno_t team_shmem_key_size;
+  nnz_lno_t team_shared_memory_hash_func;
+  nnz_lno_t team_shmem_hash_size;
+
+  nnz_lno_t team_cuckoo_key_size, team_cuckoo_hash_func;
+
+  nnz_lno_t max_first_level_hash_size;
+  row_lno_persistent_work_view_t flops_per_row;
+
+  PortableNumericCHASH(
+      nnz_lno_t block_dim_, nnz_lno_t m_, a_row_view_t row_mapA_,
+      a_nnz_view_t entriesA_, a_scalar_view_t valuesA_,
+
+      b_row_view_t row_mapB_, b_nnz_view_t entriesB_, b_scalar_view_t valuesB_,
+
+      c_row_view_t rowmapC_, c_nnz_view_t entriesC_, c_scalar_view_t valuesC_,
+      size_t shared_memory_size_, int vector_size_, pool_memory_type mpool_,
+      nnz_lno_t min_hash_size, nnz_lno_t max_nnz_, int team_size_,
+      const KokkosKernels::Impl::ExecSpaceType my_exec_space_,
+      double first_level_cut_off, row_lno_persistent_work_view_t flops_per_row_,
+      bool KOKKOSKERNELS_VERBOSE_)
+      : numrows(m_),
+        block_dim(block_dim_),
+        block_size(block_dim_ * block_dim_),
+        block_bytes(sizeof(scalar_t) * block_dim * block_dim),
+        row_mapA(row_mapA_),
+        entriesA(entriesA_),
+        valuesA(valuesA_),
+
+        row_mapB(row_mapB_),
+        entriesB(entriesB_),
+        valuesB(valuesB_),
+
+        rowmapC(rowmapC_),
+        entriesC(entriesC_),
+        valuesC(valuesC_),
+        pEntriesC(entriesC_.data()),
+        pvaluesC(valuesC_.data()),
+        shared_memory_size(shared_memory_size_ / 8 * 8),
+        vector_size(vector_size_),
+        memory_space(mpool_),
+        // max_nnz(),
+        pow2_hash_size(min_hash_size),
+        max_nnz(max_nnz_),
+        pow2_hash_func(min_hash_size - 1),
+        my_exec_space(my_exec_space_),
+        unit_memory(sizeof(nnz_lno_t) * 2 + sizeof(nnz_lno_t) + block_bytes),
+        team_size(team_size_),
+        thread_memory((shared_memory_size / 8 / team_size_) * 8),
+        thread_shmem_key_size(),
+        thread_shared_memory_hash_func(),
+        thread_shmem_hash_size(1),
+        team_shmem_key_size(),
+        team_shared_memory_hash_func(),
+        team_shmem_hash_size(1),
+        team_cuckoo_key_size(1),
+        team_cuckoo_hash_func(1),
+        max_first_level_hash_size(1),
+        flops_per_row(flops_per_row_)
+
+  {
+    nnz_lno_t tmp_team_cuckoo_key_size =
+        ((shared_memory_size - sizeof(nnz_lno_t) * 2) /
+         (sizeof(nnz_lno_t) + block_bytes));
+
+    while (team_cuckoo_key_size * 2 < tmp_team_cuckoo_key_size)
+      team_cuckoo_key_size = team_cuckoo_key_size * 2;
+    team_cuckoo_hash_func = team_cuckoo_key_size - 1;
+    team_shmem_key_size =
+        ((shared_memory_size - sizeof(nnz_lno_t) * 4 - scalarAlignPad) /
+         unit_memory);
+    thread_shmem_key_size =
+        ((thread_memory - sizeof(nnz_lno_t) * 4 - scalarAlignPad) /
+         unit_memory);
+    if (KOKKOSKERNELS_VERBOSE_) {
+      std::cout << "\t\tPortableNumericCHASH -- sizeof(scalar_t): "
+                << sizeof(scalar_t)
+                << "  sizeof(nnz_lno_t): " << sizeof(nnz_lno_t)
+                << "  team_size: " << team_size << std::endl;
+      std::cout << "\t\tPortableNumericCHASH -- thread_memory:" << thread_memory
+                << " unit_memory:" << unit_memory
+                << " initial key size:" << thread_shmem_key_size << std::endl;
+      std::cout << "\t\tPortableNumericCHASH -- team shared_memory:"
+                << shared_memory_size << " unit_memory:" << unit_memory
+                << " initial team key size:" << team_shmem_key_size
+                << std::endl;
+    }
+    while (thread_shmem_hash_size * 2 <= thread_shmem_key_size) {
+      thread_shmem_hash_size = thread_shmem_hash_size * 2;
+    }
+    while (team_shmem_hash_size * 2 <= team_shmem_key_size) {
+      team_shmem_hash_size = team_shmem_hash_size * 2;
+    }
+    team_shared_memory_hash_func   = team_shmem_hash_size - 1;
+    thread_shared_memory_hash_func = thread_shmem_hash_size - 1;
+    team_shmem_key_size =
+        team_shmem_key_size +
+        ((team_shmem_key_size - team_shmem_hash_size) * sizeof(nnz_lno_t)) /
+            (sizeof(nnz_lno_t) * 2 + block_bytes);
+    team_shmem_key_size = (team_shmem_key_size >> 1) << 1;
+
+    thread_shmem_key_size =
+        thread_shmem_key_size +
+        ((thread_shmem_key_size - thread_shmem_hash_size) * sizeof(nnz_lno_t)) /
+            (sizeof(nnz_lno_t) * 2 + block_bytes);
+    thread_shmem_key_size = (thread_shmem_key_size >> 1) << 1;
+
+    if (KOKKOSKERNELS_VERBOSE_) {
+      std::cout << "\t\tPortableNumericCHASH -- thread_memory:" << thread_memory
+                << " unit_memory:" << unit_memory
+                << " resized key size:" << thread_shmem_key_size << std::endl;
+      std::cout << "\t\tPortableNumericCHASH -- team shared_memory:"
+                << shared_memory_size << " unit_memory:" << unit_memory
+                << " resized team key size:" << team_shmem_key_size
+                << std::endl;
+    }
+
+    max_first_level_hash_size = first_level_cut_off * team_cuckoo_key_size;
+    if (KOKKOSKERNELS_VERBOSE_) {
+      std::cout << "\t\tPortableNumericCHASH -- thread_memory:" << thread_memory
+                << " unit_memory:" << unit_memory
+                << " initial key size:" << thread_shmem_key_size << std::endl;
+      std::cout << "\t\tPortableNumericCHASH -- team_memory:"
+                << shared_memory_size << " unit_memory:" << unit_memory
+                << " initial team key size:" << team_shmem_key_size
+                << std::endl;
+      std::cout << "\t\tPortableNumericCHASH -- adjusted hashsize:"
+                << thread_shmem_hash_size
+                << " thread_shmem_key_size:" << thread_shmem_key_size
+                << std::endl;
+      std::cout << "\t\tPortableNumericCHASH -- adjusted team hashsize:"
+                << team_shmem_hash_size
+                << " team_shmem_key_size:" << team_shmem_key_size << std::endl;
+      std::cout << "\t\t  team_cuckoo_key_size:" << team_cuckoo_key_size
+                << " team_cuckoo_hash_func:" << team_cuckoo_hash_func
+                << " max_first_level_hash_size:" << max_first_level_hash_size
+                << std::endl;
+      std::cout << "\t\t  pow2_hash_size:" << pow2_hash_size
+                << " pow2_hash_func:" << pow2_hash_func << std::endl;
+    }
+  }
+
+  void set_team_size(int team_size_) {
+    this->team_size     = team_size_;
+    this->thread_memory = (shared_memory_size / 8 / team_size_) * 8;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  size_t get_thread_id(const size_t row_index) const {
+    switch (my_exec_space) {
+      default: return row_index;
+#if defined(KOKKOS_ENABLE_SERIAL)
+      case KokkosKernels::Impl::Exec_SERIAL: return 0;
+#endif
+#if defined(KOKKOS_ENABLE_OPENMP)
+      case KokkosKernels::Impl::Exec_OMP:
+        return Kokkos::OpenMP::impl_hardware_thread_id();
+#endif
+#if defined(KOKKOS_ENABLE_THREADS)
+      case KokkosKernels::Impl::Exec_THREADS:
+        return Kokkos::Threads::impl_hardware_thread_id();
+#endif
+#if defined(KOKKOS_ENABLE_CUDA)
+      case KokkosKernels::Impl::Exec_CUDA: return row_index;
+#endif
+#if defined(KOKKOS_ENABLE_HIP)
+      case KokkosKernels::Impl::Exec_HIP: return row_index;
+#endif
+    }
+  }
+
+  // linear probing with tracking.
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const MultiCoreTag4 &,
+                  const team_member_t &teamMember) const {
+    const nnz_lno_t team_row_begin =
+        teamMember.league_rank() * teamMember.team_size();
+    const nnz_lno_t team_row_end = KOKKOSKERNELS_MACRO_MIN(
+        team_row_begin + teamMember.team_size(), numrows);
+
+    volatile nnz_lno_t *tmp = NULL;
+    size_t tid = get_thread_id(team_row_begin + teamMember.team_rank());
+    while (tmp == NULL) {
+      tmp = (volatile nnz_lno_t *)(memory_space.allocate_chunk(tid));
+    }
+
+    nnz_lno_t *used_indices = (nnz_lno_t *)(tmp);
+    tmp += max_nnz;
+    nnz_lno_t *hash_ids = (nnz_lno_t *)(tmp);
+    tmp += pow2_hash_size;
+
+    scalar_t *hash_values =
+        KokkosKernels::Impl::alignPtr<volatile nnz_lno_t *, scalar_t>(tmp);
+
+    BlockAccumulator hm(block_dim, pow2_hash_size, pow2_hash_func, nullptr,
+                        nullptr, hash_ids, hash_values);
+
+    Kokkos::parallel_for(
+        Kokkos::TeamThreadRange(teamMember, team_row_begin, team_row_end),
+        [&](const nnz_lno_t &row_index) {
+          nnz_lno_t used_count = 0;
+
+          const size_type col_begin = row_mapA[row_index];
+          const nnz_lno_t left_work = row_mapA[row_index + 1] - col_begin;
+          for (nnz_lno_t ii = 0; ii < left_work; ++ii) {
+            size_type a_col      = col_begin + ii;
+            nnz_lno_t rowB       = entriesA[a_col];
+            const scalar_t *valA = valuesA.data() + a_col * block_size;
+
+            size_type rowBegin   = row_mapB(rowB);
+            nnz_lno_t left_workB = row_mapB(rowB + 1) - rowBegin;
+
+            for (nnz_lno_t i = 0; i < left_workB; ++i) {
+              const size_type adjind = i + rowBegin;
+              nnz_lno_t b_col_ind    = entriesB[adjind];
+              const scalar_t *valB   = valuesB.data() + adjind * block_size;
+
+              hm.sequential_insert_into_hash_simple(b_col_ind, valA, valB,
+                                                    used_count, used_indices);
+            }
+          }
+          size_type c_row_begin = rowmapC[row_index];
+          hm.sequential_export_values_simple(
+              used_count, used_indices, pEntriesC + c_row_begin,
+              pvaluesC + c_row_begin * block_size);
+        });
+    memory_space.release_chunk(used_indices);
+  }
+
+  // assumes that the vector lane is 1, as in cpus
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const MultiCoreTag &, const team_member_t &teamMember) const {
+    const nnz_lno_t team_row_begin =
+        teamMember.league_rank() * teamMember.team_size();
+    const nnz_lno_t team_row_end = KOKKOSKERNELS_MACRO_MIN(
+        team_row_begin + teamMember.team_size(), numrows);
+
+    BlockAccumulator hm2(block_dim, pow2_hash_size, pow2_hash_func, nullptr,
+                         nullptr, nullptr, nullptr);
+
+    volatile nnz_lno_t *tmp = NULL;
+    size_t tid = get_thread_id(team_row_begin + teamMember.team_rank());
+    while (tmp == NULL) {
+      tmp = (volatile nnz_lno_t *)(memory_space.allocate_chunk(tid));
+    }
+    nnz_lno_t *globally_used_hash_indices = (nnz_lno_t *)tmp;
+    tmp += pow2_hash_size;
+
+    hm2.hash_begins = (nnz_lno_t *)(tmp);
+    tmp += pow2_hash_size;
+    hm2.hash_nexts = (nnz_lno_t *)(tmp);
+
+    Kokkos::parallel_for(
+        Kokkos::TeamThreadRange(teamMember, team_row_begin, team_row_end),
+        [&](const nnz_lno_t &row_index) {
+          nnz_lno_t globally_used_hash_count = 0;
+          nnz_lno_t used_hash_sizes          = 0;
+
+          const size_type c_row_begin = rowmapC[row_index];
+
+          hm2.keys   = pEntriesC + c_row_begin;
+          hm2.values = pvaluesC + c_row_begin * block_size;
+
+          const size_type col_begin = row_mapA[row_index];
+          const nnz_lno_t left_work = row_mapA[row_index + 1] - col_begin;
+
+          for (nnz_lno_t ii = 0; ii < left_work; ++ii) {
+            size_type a_col       = col_begin + ii;
+            nnz_lno_t rowB        = entriesA[a_col];
+            const scalar_t *a_val = valuesA.data() + a_col * block_size;
+
+            size_type rowBegin   = row_mapB(rowB);
+            nnz_lno_t left_workB = row_mapB(rowB + 1) - rowBegin;
+
+            for (nnz_lno_t i = 0; i < left_workB; ++i) {
+              const size_type adjind = i + rowBegin;
+              nnz_lno_t b_col_ind    = entriesB[adjind];
+              const scalar_t *b_val  = valuesB.data() + adjind * block_size;
+              // nnz_lno_t hash = (b_col_ind * 107) & pow2_hash_func;
+
+              // this has to be a success, we do not need to check for the
+              // success. int insertion =
+              hm2.sequential_insert_into_hash_mergeAdd_TrackHashes(
+                  b_col_ind, a_val, b_val, &used_hash_sizes,
+                  &globally_used_hash_count, globally_used_hash_indices);
+            }
+          }
+          for (nnz_lno_t i = 0; i < globally_used_hash_count; ++i) {
+            nnz_lno_t dirty_hash        = globally_used_hash_indices[i];
+            hm2.hash_begins[dirty_hash] = -1;
+          }
+        });
+    memory_space.release_chunk(globally_used_hash_indices);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const GPUTag &, const team_member_t &teamMember) const {
+    nnz_lno_t team_row_begin =
+        teamMember.league_rank() * teamMember.team_size();
+    const nnz_lno_t team_row_end = KOKKOSKERNELS_MACRO_MIN(
+        team_row_begin + teamMember.team_size(), numrows);
+
+    // int thread_memory = (shared_memory_size / 8 / teamMember.team_size()) *
+    // 8;
+    char *all_shared_memory =
+        (char *)(teamMember.team_shmem().get_shmem(shared_memory_size));
+
+    // shift it to the thread private part
+    all_shared_memory += thread_memory * teamMember.team_rank();
+
+    // used_hash_sizes hold the size of 1st and 2nd level hashes
+    volatile nnz_lno_t *used_hash_sizes =
+        (volatile nnz_lno_t *)(all_shared_memory);
+    all_shared_memory += sizeof(nnz_lno_t) * 2;
+
+    nnz_lno_t *globally_used_hash_count = (nnz_lno_t *)(all_shared_memory);
+    all_shared_memory += sizeof(nnz_lno_t) * 2;
+
+    // int unit_memory = sizeof(nnz_lno_t) * 2 + sizeof(nnz_lno_t) + sizeof
+    // (scalar_t) ; //begins, nexts, keys and vals . nnz_lno_t shmem_key_size =
+    // (thread_memory - sizeof(nnz_lno_t) * 4) / unit_memory; if (shmem_key_size
+    // & 1) shmem_key_size -= 1;
+
+    nnz_lno_t *begins = (nnz_lno_t *)(all_shared_memory);
+    all_shared_memory += sizeof(nnz_lno_t) * thread_shmem_hash_size;
+
+    // points to the next elements
+    nnz_lno_t *nexts = (nnz_lno_t *)(all_shared_memory);
+    all_shared_memory += sizeof(nnz_lno_t) * thread_shmem_key_size;
+
+    // holds the keys
+    nnz_lno_t *keys = (nnz_lno_t *)(all_shared_memory);
+    all_shared_memory += sizeof(nnz_lno_t) * thread_shmem_key_size;
+    // remainder of shmem allocation for vals
+    scalar_t *vals =
+        KokkosKernels::Impl::alignPtr<char *, scalar_t>(all_shared_memory);
+
+    BlockAccumulator hm(block_dim, thread_shmem_key_size,
+                        thread_shared_memory_hash_func, begins, nexts, keys,
+                        vals);
+
+    BlockAccumulator hm2(block_dim, pow2_hash_size, pow2_hash_func, nullptr,
+                         nullptr, nullptr, nullptr);
+    Kokkos::parallel_for(
+        Kokkos::TeamThreadRange(teamMember, team_row_begin, team_row_end),
+        [&](const nnz_lno_t &row_index) {
+          const size_type c_row_begin = rowmapC[row_index];
+          const size_type c_row_end   = rowmapC[row_index + 1];
+          const nnz_lno_t global_memory_hash_size =
+              nnz_lno_t(c_row_end - c_row_begin);
+
+          bool is_global_alloced                = false;
+          nnz_lno_t *globally_used_hash_indices = NULL;
+
+          if (global_memory_hash_size > thread_shmem_key_size) {
+            volatile nnz_lno_t *tmp = NULL;
+            // size_t tid = get_thread_id(row_index);
+            // the code gets internal compiler error on gcc 4.7.2
+            // assuming that this part only runs on GPUs for now, below fix
+            // has the exact same behaviour and runs okay.
+            size_t tid = row_index;
+
+            while (tmp == NULL) {
+              Kokkos::single(
+                  Kokkos::PerThread(teamMember),
+                  [&](volatile nnz_lno_t *&memptr) {
+                    memptr = (volatile nnz_lno_t *)(memory_space.allocate_chunk(
+                        tid));
+                  },
+                  tmp);
+            }
+
+            is_global_alloced          = true;
+            globally_used_hash_indices = (nnz_lno_t *)tmp;
+            tmp += pow2_hash_size;
+            hm2.hash_begins = (nnz_lno_t *)(tmp);
+            tmp += pow2_hash_size;
+            hm2.hash_nexts = (nnz_lno_t *)(tmp);
+          }
+          hm2.keys   = pEntriesC + c_row_begin;
+          hm2.values = pvaluesC + c_row_begin * block_size;
+
+          // initialize begins.
+          Kokkos::parallel_for(
+              Kokkos::ThreadVectorRange(teamMember, thread_shmem_hash_size),
+              [&](nnz_lno_t i) { begins[i] = -1; });
+
+          // initialize hash usage sizes
+          Kokkos::single(Kokkos::PerThread(teamMember), [&]() {
+            used_hash_sizes[0]          = 0;
+            used_hash_sizes[1]          = 0;
+            globally_used_hash_count[0] = 0;
+          });
+
+          const size_type col_begin = row_mapA[row_index];
+          const nnz_lno_t left_work = row_mapA[row_index + 1] - col_begin;
+          nnz_lno_t ii              = left_work;
+          // for ( nnz_lno_t ii = 0; ii < left_work; ++ii){
+          while (ii-- > 0) {
+            size_type a_col      = col_begin + ii;
+            nnz_lno_t rowB       = entriesA[a_col];
+            const scalar_t *valA = valuesA.data() + a_col * block_size;
+
+            size_type rowBegin   = row_mapB(rowB);
+            nnz_lno_t left_work_ = row_mapB(rowB + 1) - rowBegin;
+            Kokkos::parallel_for(
+                Kokkos::ThreadVectorRange(teamMember, left_work_),
+                [&](nnz_lno_t i) {
+                  const size_type adjind = i + rowBegin;
+                  nnz_lno_t b_col_ind    = entriesB[adjind];
+                  const scalar_t *valB   = valuesB.data() + adjind * block_size;
+                  volatile int num_unsuccess =
+                      hm.vector_atomic_insert_into_hash_mergeAdd(
+                          b_col_ind, valA, valB, used_hash_sizes);
+                  if (num_unsuccess) {
+                    hm2.vector_atomic_insert_into_hash_mergeAdd_TrackHashes(
+                        b_col_ind, valA, valB, used_hash_sizes + 1,
+                        globally_used_hash_count, globally_used_hash_indices);
+                  }
+                });
+          }
+
+          if (is_global_alloced) {
+            nnz_lno_t dirty_hashes = globally_used_hash_count[0];
+            Kokkos::parallel_for(
+                Kokkos::ThreadVectorRange(teamMember, dirty_hashes),
+                [&](nnz_lno_t i) {
+                  nnz_lno_t dirty_hash        = globally_used_hash_indices[i];
+                  hm2.hash_begins[dirty_hash] = -1;
+                });
+
+            Kokkos::single(Kokkos::PerThread(teamMember), [&]() {
+              memory_space.release_chunk(globally_used_hash_indices);
+            });
+          }
+
+          Kokkos::single(Kokkos::PerThread(teamMember), [&]() {
+            if (used_hash_sizes[0] > thread_shmem_key_size)
+              used_hash_sizes[0] = thread_shmem_key_size;
+          });
+
+          nnz_lno_t num_elements = used_hash_sizes[0];
+
+          nnz_lno_t written_index = used_hash_sizes[1];
+          Kokkos::parallel_for(
+              Kokkos::ThreadVectorRange(teamMember, num_elements),
+              [&](nnz_lno_t i) {
+                const auto idx = c_row_begin + written_index + i;
+                pEntriesC[idx] = keys[i];
+                kk_block_set(block_dim, pvaluesC + idx * block_size,
+                             vals + i * block_size);
+              });
+        });
+  }
+
+  // one row does not fit into shmem, with thread-flat-parallel
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const GPUTag6 &, const team_member_t &teamMember) const {
+    nnz_lno_t team_row_begin =
+        teamMember.league_rank() * teamMember.team_size();
+    const nnz_lno_t team_row_end = KOKKOSKERNELS_MACRO_MIN(
+        team_row_begin + teamMember.team_size(), numrows);
+    char *all_shared_memory =
+        (char *)(teamMember.team_shmem().get_shmem(shared_memory_size));
+
+    // shmem == sizeof(nnz_lno_t)*2 + sizeof(nnz_lno_t)*team_cuckoo_key_size +
+    // sizeof(scalar_t)*nvals
+    const nnz_lno_t init_value = -1;
+    volatile nnz_lno_t *used_hash_sizes =
+        (volatile nnz_lno_t *)(all_shared_memory);
+    all_shared_memory += sizeof(nnz_lno_t) * 2;
+    // holds the keys
+    nnz_lno_t *keys = (nnz_lno_t *)(all_shared_memory);
+    all_shared_memory += sizeof(nnz_lno_t) * team_cuckoo_key_size;
+    scalar_t *vals =
+        KokkosKernels::Impl::alignPtr<char *, scalar_t>(all_shared_memory);
+
+    int thread_rank = teamMember.team_rank();
+
+    int vector_rank = 0;
+    typedef typename std::remove_reference<decltype(*used_hash_sizes)>::type
+        atomic_incr_type;
+    Kokkos::parallel_scan(
+        Kokkos::ThreadVectorRange(teamMember, vector_size),
+        [&](const int /* threadid */, int &update, const bool final) {
+          if (final) {
+            vector_rank = update;
+          }
+          update += 1;
+        });
+    int bs           = vector_size * team_size;
+    int vector_shift = thread_rank * vector_size + vector_rank;
+
+    for (nnz_lno_t row_index = team_row_begin; row_index < team_row_end;
+         ++row_index) {
+      if (row_mapA[row_index] == row_mapA[row_index + 1])  // skip empty A rows
+        continue;
+#if 1
+      teamMember.team_barrier();
+#endif
+      const size_type c_row_begin = rowmapC[row_index];
+      const size_type c_row_end   = rowmapC[row_index + 1];
+      const nnz_lno_t c_row_size  = c_row_end - c_row_begin;
+      nnz_lno_t *c_row            = entriesC.data() + c_row_begin;
+      scalar_t *c_row_vals        = valuesC.data() + c_row_begin * block_size;
+      nnz_lno_t *global_acc_row_keys = c_row;
+      scalar_t *global_acc_row_vals  = c_row_vals;
+      volatile nnz_lno_t *tmp        = NULL;
+
+      if (c_row_size > max_first_level_hash_size) {
+        {
+          while (tmp == NULL) {
+            Kokkos::single(
+                Kokkos::PerTeam(teamMember),
+                [&](volatile nnz_lno_t *&memptr) {
+                  memptr = (volatile nnz_lno_t *)(memory_space.allocate_chunk(
+                      row_index));
+                },
+                tmp);
+          }
+          global_acc_row_keys = (nnz_lno_t *)(tmp);
+          global_acc_row_vals =
+              KokkosKernels::Impl::alignPtr<volatile nnz_lno_t *, scalar_t>(
+                  tmp + pow2_hash_size);
+        }
+        // initialize begins.
+        {
+          nnz_lno_t num_threads = pow2_hash_size / vector_size;
+          // not needed as team_cuckoo_key_size is always pow2. +
+          // (team_cuckoo_key_size & (vector_size - 1)) * 1;
+          Kokkos::parallel_for(
+              Kokkos::TeamThreadRange(teamMember, num_threads),
+              [&](nnz_lno_t teamind) {
+                Kokkos::parallel_for(
+                    Kokkos::ThreadVectorRange(teamMember, vector_size),
+                    [&](nnz_lno_t i) {
+                      const auto idx = teamind * vector_size + i;
+                      kk_block_init(block_dim,
+                                    global_acc_row_vals + idx * block_size);
+                    });
+              });
+        }
+      }
+
+      // initialize begins.
+      {
+        nnz_lno_t num_threads = team_cuckoo_key_size / vector_size;
+        // not needed as team_cuckoo_key_size is always pow2. +
+        // (team_cuckoo_key_size & (vector_size - 1)) * 1;
+        Kokkos::parallel_for(
+            Kokkos::TeamThreadRange(teamMember, num_threads),
+            [&](nnz_lno_t teamind) {
+              Kokkos::parallel_for(
+                  Kokkos::ThreadVectorRange(teamMember, vector_size),
+                  [&](nnz_lno_t i) {
+                    const auto idx = teamind * vector_size + i;
+                    keys[idx]      = init_value;
+                    kk_block_init(block_dim, vals + idx * block_size);
+                  });
+            });
+      }
+
+      // initialize hash usage sizes
+      Kokkos::single(Kokkos::PerTeam(teamMember), [&]() {
+        used_hash_sizes[0] = 0;
+        used_hash_sizes[1] = 0;
+      });
+
+      bool insert_is_on                  = true;
+      const size_type a_col_begin_offset = row_mapA[row_index];
+
+      nnz_lno_t a_col_ind   = entriesA[a_col_begin_offset];
+      const scalar_t *a_val = valuesA.data() + a_col_begin_offset * block_size;
+
+      nnz_lno_t current_a_column_offset_inrow = 0;
+      nnz_lno_t flops_on_the_left_of_offsett  = 0;
+      size_type current_b_read_offsett        = row_mapB[a_col_ind];
+      nnz_lno_t current_a_column_flops =
+          row_mapB[a_col_ind + 1] - current_b_read_offsett;
+
+      nnz_lno_t row_flops = flops_per_row(row_index);
+
+#if 1
+      teamMember.team_barrier();
+#endif
+      for (nnz_lno_t vector_read_shift = vector_shift;
+           vector_read_shift < row_flops; vector_read_shift += bs) {
+        {
+          nnz_lno_t my_b_col_shift =
+              vector_read_shift - flops_on_the_left_of_offsett;
+          nnz_lno_t my_b_col = init_value;
+          nnz_lno_t hash     = init_value;
+          int fail           = 0;
+
+          if (my_b_col_shift >= current_a_column_flops) {
+            do {
+              ++current_a_column_offset_inrow;
+              my_b_col_shift -= current_a_column_flops;
+              flops_on_the_left_of_offsett += current_a_column_flops;
+              a_col_ind =
+                  entriesA[a_col_begin_offset + current_a_column_offset_inrow];
+
+              current_b_read_offsett = row_mapB[a_col_ind];
+              current_a_column_flops =
+                  row_mapB[a_col_ind + 1] - current_b_read_offsett;
+            } while (my_b_col_shift >= current_a_column_flops);
+            const auto idx = a_col_begin_offset + current_a_column_offset_inrow;
+            a_val          = valuesA.data() + idx * block_size;
+          }
+
+          const auto idx        = my_b_col_shift + current_b_read_offsett;
+          my_b_col              = entriesB[idx];
+          const scalar_t *b_val = valuesB.data() + idx * block_size;
+          // now insert it to first level hashmap accumulator.
+          hash               = (my_b_col * HASHSCALAR) & team_cuckoo_hash_func;
+          fail               = 1;
+          bool try_to_insert = true;
+
+          // nnz_lno_t max_tries = team_cuckoo_key_size;
+          nnz_lno_t search_end =
+              team_cuckoo_key_size;  // KOKKOSKERNELS_MACRO_MIN(team_cuckoo_key_size,
+                                     // hash + max_tries);
+          for (nnz_lno_t trial = hash; trial < search_end;) {
+            if (keys[trial] == my_b_col) {
+              kk_vector_block_add_mul(block_dim, vals + trial * block_size,
+                                      a_val, b_val);
+              fail = 0;
+              break;
+            } else if (keys[trial] == init_value) {
+              if (!insert_is_on) {
+                try_to_insert = false;
+                break;
+              } else if (Kokkos::atomic_compare_exchange_strong(
+                             keys + trial, init_value, my_b_col)) {
+                kk_vector_block_add_mul(block_dim, vals + trial * block_size,
+                                        a_val, b_val);
+                Kokkos::atomic_increment(used_hash_sizes);
+                if (used_hash_sizes[0] > max_first_level_hash_size)
+                  insert_is_on = false;
+                fail = 0;
+                break;
+              }
+            } else {
+              ++trial;
+            }
+          }
+          if (fail) {
+            search_end = hash;  // max_tries - (team_cuckoo_key_size -  hash);
+
+            for (nnz_lno_t trial = 0; try_to_insert && trial < search_end;) {
+              if (keys[trial] == my_b_col) {
+                kk_vector_block_add_mul(block_dim, vals + trial * block_size,
+                                        a_val, b_val);
+                fail = 0;
+                break;
+              } else if (keys[trial] == init_value) {
+                if (!insert_is_on) {
+                  break;
+                } else if (Kokkos::atomic_compare_exchange_strong(
+                               keys + trial, init_value, my_b_col)) {
+                  kk_vector_block_add_mul(block_dim, vals + trial * block_size,
+                                          a_val, b_val);
+                  Kokkos::atomic_increment(used_hash_sizes);
+                  if (used_hash_sizes[0] > max_first_level_hash_size)
+                    insert_is_on = false;
+                  fail = 0;
+                  break;
+                }
+              } else {
+                ++trial;
+              }
+            }
+
+            if (fail) {
+              nnz_lno_t new_hash = (my_b_col * HASHSCALAR) & pow2_hash_func;
+
+              for (nnz_lno_t trial = new_hash; trial < pow2_hash_size;) {
+                if (global_acc_row_keys[trial] == my_b_col) {
+                  kk_vector_block_add_mul(
+                      block_dim, global_acc_row_vals + trial * block_size,
+                      a_val, b_val);
+                  // c_row_vals[trial] += my_b_val;
+                  fail = 0;
+                  break;
+                } else if (global_acc_row_keys[trial] == init_value) {
+                  if (Kokkos::atomic_compare_exchange_strong(
+                          global_acc_row_keys + trial, init_value, my_b_col)) {
+                    kk_vector_block_add_mul(
+                        block_dim, global_acc_row_vals + trial * block_size,
+                        a_val, b_val);
+                    // Kokkos::atomic_increment(used_hash_sizes + 1);
+                    // c_row_vals[trial] = my_b_val;
+                    fail = 0;
+                    break;
+                  }
+                } else {
+                  ++trial;
+                }
+              }
+              if (fail) {
+                for (nnz_lno_t trial = 0; trial < new_hash;) {
+                  if (global_acc_row_keys[trial] == my_b_col) {
+                    // c_row_vals[trial] += my_b_val;
+                    kk_vector_block_add_mul(
+                        block_dim, global_acc_row_vals + trial * block_size,
+                        a_val, b_val);
+                    break;
+                  } else if (global_acc_row_keys[trial] == init_value) {
+                    if (Kokkos::atomic_compare_exchange_strong(
+                            global_acc_row_keys + trial, init_value,
+                            my_b_col)) {
+                      // Kokkos::atomic_increment(used_hash_sizes + 1);
+                      kk_vector_block_add_mul(
+                          block_dim, global_acc_row_vals + trial * block_size,
+                          a_val, b_val);
+                      // c_row_vals[trial] = my_b_val;
+                      break;
+                    }
+                  } else {
+                    ++trial;
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+
+      teamMember.team_barrier();
+
+      if (tmp != NULL) {
+        for (nnz_lno_t my_index = vector_shift; my_index < pow2_hash_size;
+             my_index += bs) {
+          nnz_lno_t my_b_col = global_acc_row_keys[my_index];
+          if (my_b_col != init_value) {
+            const scalar_t *b_val = global_acc_row_vals + my_index * block_size;
+            int fail              = 1;
+            {
+              nnz_lno_t trial = (my_b_col * HASHSCALAR) & team_cuckoo_hash_func;
+              for (nnz_lno_t max_tries = team_cuckoo_key_size; max_tries-- > 0;
+                   trial               = (trial + 1) & team_cuckoo_hash_func) {
+                if (keys[trial] == my_b_col) {
+                  kk_block_add(block_dim, vals + trial * block_size, b_val);
+                  fail = 0;
+                  break;
+                } else if (keys[trial] == init_value) {
+                  break;
+                }
+              }
+            }
+            if (fail) {
+              nnz_lno_t write_index = 0;
+              write_index        = Kokkos::atomic_fetch_add(used_hash_sizes + 1,
+                                                     atomic_incr_type(1));
+              c_row[write_index] = my_b_col;
+              kk_block_set(block_dim, c_row_vals + write_index * block_size,
+                           b_val);
+            }
+            global_acc_row_keys[my_index] = init_value;
+          }
+        }
+
+        teamMember.team_barrier();
+        Kokkos::single(Kokkos::PerTeam(teamMember), [&]() {
+          memory_space.release_chunk(global_acc_row_keys);
+        });
+      }
+
+      for (nnz_lno_t my_index = vector_shift; my_index < team_cuckoo_key_size;
+           my_index += bs) {
+        nnz_lno_t my_key = keys[my_index];
+        if (my_key != init_value) {
+          const scalar_t *my_val = vals + my_index * block_size;
+          nnz_lno_t write_index  = 0;
+          write_index            = Kokkos::atomic_fetch_add(used_hash_sizes + 1,
+                                                 atomic_incr_type(1));
+          c_row[write_index]     = my_key;
+          kk_block_set(block_dim, c_row_vals + write_index * block_size,
+                       my_val);
+        }
+      }
+    }
+  }
+
+  // In this one row fits into shmem with team-flat-parallel
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const GPUTag4 &, const team_member_t &teamMember) const {
+    const nnz_lno_t init_value = -1;
+    nnz_lno_t team_row_begin =
+        teamMember.league_rank() * teamMember.team_size();
+    const nnz_lno_t team_row_end = KOKKOSKERNELS_MACRO_MIN(
+        team_row_begin + teamMember.team_size(), numrows);
+
+    // shmem == sizeof(nnz_lno_t)*2 + sizeof(nnz_lno_t)*team_cuckoo_key_size +
+    // sizeof(scalar_t)*nvals
+    char *all_shared_memory =
+        (char *)(teamMember.team_shmem().get_shmem(shared_memory_size));
+
+    volatile nnz_lno_t *used_hash_sizes =
+        (volatile nnz_lno_t *)(all_shared_memory);
+    all_shared_memory += sizeof(nnz_lno_t) * 2;
+
+    // holds the keys
+    nnz_lno_t *keys = (nnz_lno_t *)(all_shared_memory);
+    all_shared_memory += sizeof(nnz_lno_t) * team_cuckoo_key_size;
+    scalar_t *vals =
+        KokkosKernels::Impl::alignPtr<char *, scalar_t>(all_shared_memory);
+
+    int thread_rank = teamMember.team_rank();
+
+    int vector_rank = 0;
+    typedef typename std::remove_reference<decltype(*used_hash_sizes)>::type
+        atomic_incr_type;
+    Kokkos::parallel_scan(
+        Kokkos::ThreadVectorRange(teamMember, vector_size),
+        [&](const int /* threadid */, int &update, const bool final) {
+          if (final) {
+            vector_rank = update;
+          }
+          update += 1;
+        });
+    int bs           = vector_size * team_size;
+    int vector_shift = thread_rank * vector_size + vector_rank;
+    for (nnz_lno_t row_index = team_row_begin; row_index < team_row_end;
+         ++row_index) {
+      if (row_mapA[row_index] == row_mapA[row_index + 1])  // skip empty A rows
+        continue;
+#if 1
+      teamMember.team_barrier();
+#endif
+      const size_type c_row_begin = rowmapC[row_index];
+      // const size_type c_row_end = rowmapC[row_index + 1];
+      // const nnz_lno_t c_row_size = c_row_end -  c_row_begin;
+      nnz_lno_t *c_row     = entriesC.data() + c_row_begin;
+      scalar_t *c_row_vals = valuesC.data() + c_row_begin * block_size;
+
+      // initialize begins.
+      {
+        nnz_lno_t num_threads =
+            team_cuckoo_key_size /
+            vector_size;  // not needed as team_cuckoo_key_size is always pow2.
+                          // + (team_cuckoo_key_size & (vector_size - 1)) * 1;
+        Kokkos::parallel_for(
+            Kokkos::TeamThreadRange(teamMember, num_threads),
+            [&](nnz_lno_t teamind) {
+              // nnz_lno_t team_shift = teamind * vector_size;
+              // nnz_lno_t work_to_handle = KOKKOSKERNELS_MACRO_MIN(vector_size,
+              // team_shmem_hash_size - team_shift);
+              Kokkos::parallel_for(
+                  Kokkos::ThreadVectorRange(teamMember, vector_size),
+                  [&](nnz_lno_t i) {
+                    const auto idx = teamind * vector_size + i;
+                    keys[idx]      = init_value;
+                    kk_block_init(block_dim, vals + idx * block_size);
+                  });
+            });
+      }
+
+#if 0
+      teamMember.team_barrier();
+
+      Kokkos::single(Kokkos::PerTeam(teamMember),[&] () {
+
+      for (int i = 0; i < team_shmem_hash_size; ++i){
+    	  if (begins[i] != init_value){
+    		  std::cout << "row_index:" << row_index << " i:" << i << " team_shmem_hash_size:" << team_shmem_hash_size << " is not init_value begins[i]:" << begins[i] << std::endl;
+    	  }
+      }
+      });
+
+      teamMember.team_barrier();
+#endif
+      // initialize hash usage sizes
+      Kokkos::single(Kokkos::PerTeam(teamMember), [&]() {
+        used_hash_sizes[0] = 0;
+        used_hash_sizes[1] = 0;
+#if 0
+        globally_used_hash_count[0] = 0;
+#endif
+      });
+#if 0
+
+      teamMember.team_barrier();
+#endif
+#if 0
+      bool is_global_alloced = false;
+      nnz_lno_t *globally_used_hash_indices = NULL;
+#endif
+      const size_type a_col_begin_offset = row_mapA[row_index];
+
+      nnz_lno_t a_col_ind   = entriesA[a_col_begin_offset];
+      const scalar_t *a_val = valuesA.data() + a_col_begin_offset * block_size;
+
+      nnz_lno_t current_a_column_offset_inrow = 0;
+      nnz_lno_t flops_on_the_left_of_offsett  = 0;
+      size_type current_b_read_offsett        = row_mapB[a_col_ind];
+      nnz_lno_t current_a_column_flops =
+          row_mapB[a_col_ind + 1] - current_b_read_offsett;
+
+      // nnz_lno_t ii = left_work;
+      nnz_lno_t row_flops = flops_per_row(row_index);
+
+#if 1
+      teamMember.team_barrier();
+#endif
+
+      for (nnz_lno_t vector_read_shift = vector_shift;
+           vector_read_shift < row_flops; vector_read_shift += bs) {
+        {
+          nnz_lno_t my_b_col_shift =
+              vector_read_shift - flops_on_the_left_of_offsett;
+          nnz_lno_t my_b_col = init_value;
+          nnz_lno_t hash     = init_value;
+          int fail           = 0;
+
+          if (my_b_col_shift >= current_a_column_flops) {
+            do {
+              ++current_a_column_offset_inrow;
+              my_b_col_shift -= current_a_column_flops;
+              flops_on_the_left_of_offsett += current_a_column_flops;
+              a_col_ind =
+                  entriesA[a_col_begin_offset + current_a_column_offset_inrow];
+
+              current_b_read_offsett = row_mapB[a_col_ind];
+              current_a_column_flops =
+                  row_mapB[a_col_ind + 1] - current_b_read_offsett;
+            } while (my_b_col_shift >= current_a_column_flops);
+            const auto idx = a_col_begin_offset + current_a_column_offset_inrow;
+            a_val          = valuesA.data() + idx * block_size;
+          }
+
+          my_b_col = entriesB[my_b_col_shift + current_b_read_offsett];
+
+          const auto idx        = my_b_col_shift + current_b_read_offsett;
+          const scalar_t *b_val = valuesB.data() + idx * block_size;
+
+          // now insert it to first level hashmap accumulator.
+          hash = (my_b_col * HASHSCALAR) & team_cuckoo_hash_func;
+          fail = 1;
+
+          for (nnz_lno_t trial = hash; trial < team_cuckoo_key_size;) {
+            if (keys[trial] == my_b_col) {
+              kk_vector_block_add_mul(block_dim, vals + trial * block_size,
+                                      a_val, b_val);
+              fail = 0;
+              break;
+            } else if (keys[trial] == init_value) {
+              if (Kokkos::atomic_compare_exchange_strong(
+                      keys + trial, init_value, my_b_col)) {
+                kk_vector_block_add_mul(block_dim, vals + trial * block_size,
+                                        a_val, b_val);
+                fail = 0;
+                break;
+              }
+            } else {
+              ++trial;
+            }
+          }
+          if (fail) {
+            for (nnz_lno_t trial = 0; trial < hash;) {
+              if (keys[trial] == my_b_col) {
+                kk_vector_block_add_mul(block_dim, vals + trial * block_size,
+                                        a_val, b_val);
+                fail = 0;
+                break;
+              } else if (keys[trial] == init_value) {
+                if (Kokkos::atomic_compare_exchange_strong(
+                        keys + trial, init_value, my_b_col)) {
+                  kk_vector_block_add_mul(block_dim, vals + trial * block_size,
+                                          a_val, b_val);
+                  fail = 0;
+                  break;
+                }
+              } else {
+                ++trial;
+              }
+            }
+          }
+        }
+      }
+
+      teamMember.team_barrier();
+      for (nnz_lno_t my_index = vector_shift; my_index < team_cuckoo_key_size;
+           my_index += bs) {
+        nnz_lno_t my_key = keys[my_index];
+        if (my_key != init_value) {
+          const scalar_t *my_val = vals + my_index * block_size;
+          nnz_lno_t write_index =
+              Kokkos::atomic_fetch_add(used_hash_sizes, atomic_incr_type(1));
+          c_row[write_index] = my_key;
+          kk_block_set(block_dim, c_row_vals + write_index * block_size,
+                       my_val);
+        }
+      }
+    }
+  }
+
+  size_t team_shmem_size(int /* team_size */) const {
+    return shared_memory_size;
+  }
+};
+
+//
+// * Notes on KokkosBSPGEMM_numeric_hash *
+//
+// Prior to this routine, KokkosBSPGEMM_numeric(...) was called
+//
+//   KokkosBSPGEMM_numeric(...) :
+//     if (this->spgemm_algorithm == SPGEMM_KK || SPGEMM_KK_LP ==
+//     this->spgemm_algorithm) :
+//       call KokkosBSPGEMM_numeric_speed(...)
+//     else:
+//       call  KokkosBSPGEMM_numeric_hash(...)  (this code!)
+//
+//     * NOTE: KokkosBSPGEMM_numeric_hash2(...) is not called
+//
+//
+// KokkosBSPGEMM_numeric_hash:
+//
+// Algorithm selection may be modified as follows
+//
+//   algorithm_to_run: initialized to spgemm_algorithm input to
+//   KokkosBSPGEMM_numeric_hash
+//     * spgemm_algorithm CANNOT be SPGEMM_KK_SPEED or SPGEMM_KK_DENSE
+//
+//  if (this->spgemm_algorithm == SPGEMM_KK || SPGEMM_KK_LP ==
+//  this->spgemm_algorithm) :
+//     if Cuda enabled :
+//       1. perform shmem-size + partition computations (used by
+//       HashMapAccumulator) and flop estimate
+//       2. from results of 1. select from SPGEMM_KK_MEMORY_SPREADTEAM,
+//       SPGEMM_KK_MEMORY_BIGSPREADTEAM, SPGEMM_KK_MEMORY
+//          * Note: These shmem calculations are not passed along to the
+//          PortableNumericCHASH functor used by kernels
+//            TODO check the pre-shmem calculations and functor shmem
+//            calculations consistent - pass shmem values to functor
+//     else :
+//       1. determine if problem is "dense"
+//       2. if dense: call "this->KokkosBSPGEMM_numeric_speed"
+//          else : no change from algorithm_to_run; that is algorithm_to_run ==
+//          SPGEMM_KK || SPGEMM_KK_LP
+//
+//  else :
+//     skip modification of input algorithm
+//
+//
+//
+// Algorithm type matching to kernel Tag:
+//
+//   Policy typedefs with tags found in: KokkosSparse_spgemm_impl.hpp
+//
+//  Cuda algorithm options:
+//   (algorithm_to_run == SPGEMM_KK_MEMORY_SPREADTEAM) : gpu_team_policy4_t,
+//   i.e. GPUTag4 (algorithm_to_run == SPGEMM_KK_MEMORY_BIGSPREADTEAM) :
+//   gpu_team_policy6_t,  i.e. GPUTag6 (default == SPGEMM_KK_MEMORY) :
+//   gpu_team_policy_t,  i.e. GPUTag
+//
+//  Non-Cuda host algorithm options:
+//   SPGEMM_KK_LP:
+//     (algorithm_to_run == SPGEMM_KK_LP + Dynamic) :
+//     dynamic_multicore_team_policy4_t,  i.e. MultiCoreTag4 (algorithm_to_run
+//     == SPGEMM_KK_LP + Static) :  dynamic_multicore_team_policy4_t //
+//     typo/bug, should be multicore_team_policy4_t?
+//   else SPGEMM::KKMEM
+//     kernel label: "KOKKOSPARSE::SPGEMM::KKMEM::DYNAMIC" :
+//     dynamic_multicore_team_policy_t,  i.e. MultiCoreTag kernel label:
+//     "KOKKOSPARSE::SPGEMM::KKMEM::STATIC"  : multicore_team_policy_t,  i.e.
+//     MultiCoreTag
+
+template <typename HandleType, typename a_row_view_t_,
+          typename a_lno_nnz_view_t_, typename a_scalar_nnz_view_t_,
+          typename b_lno_row_view_t_, typename b_lno_nnz_view_t_,
+          typename b_scalar_nnz_view_t_>
+template <typename c_row_view_t, typename c_lno_nnz_view_t,
+          typename c_scalar_nnz_view_t>
+void KokkosBSPGEMM<HandleType, a_row_view_t_, a_lno_nnz_view_t_,
+                   a_scalar_nnz_view_t_, b_lno_row_view_t_, b_lno_nnz_view_t_,
+                   b_scalar_nnz_view_t_>::
+    KokkosBSPGEMM_numeric_hash(
+        c_row_view_t rowmapC_, c_lno_nnz_view_t entriesC_,
+        c_scalar_nnz_view_t valuesC_,
+        KokkosKernels::Impl::ExecSpaceType lcl_my_exec_space) {
+  if (Base::KOKKOSKERNELS_VERBOSE) {
+    std::cout << "\tHASH MODE" << std::endl;
+  }
+  KokkosSparse::SPGEMMAlgorithm algorithm_to_run = this->spgemm_algorithm;
+  nnz_lno_t brows                                = Base::row_mapB.extent(0) - 1;
+  size_type bnnz                                 = Base::valsB.extent(0);
+
+  int suggested_vector_size =
+      this->handle->get_suggested_vector_size(brows, bnnz);
+  int suggested_team_size =
+      this->handle->get_suggested_team_size(suggested_vector_size);
+  size_t shmem_size_to_use = Base::shmem_size;
+
+  row_lno_persistent_work_view_t flops_per_row =
+      this->handle->get_spgemm_handle()->row_flops;
+  size_t original_overall_flops =
+      this->handle->get_spgemm_handle()->original_overall_flops;
+  nnz_lno_t max_nnz = this->handle->get_spgemm_handle()->get_max_result_nnz();
+  size_type overall_nnz = this->handle->get_spgemm_handle()->get_c_nnz();
+
+  typedef KokkosKernels::Impl::UniformMemoryPool<MyTempMemorySpace, nnz_lno_t>
+      pool_memory_space;
+  nnz_lno_t min_hash_size = 1;
+  size_t chunksize        = 1;
+  double first_level_cut_off =
+      this->handle->get_spgemm_handle()->get_first_level_hash_cut_off();
+  int hash_scaler =
+      this->handle->get_spgemm_handle()->get_min_hash_size_scale();
+  nnz_lno_t tmp_max_nnz = max_nnz;
+
+  if (hash_scaler == 0) {
+    tmp_max_nnz = KOKKOSKERNELS_MACRO_MAX(
+        max_nnz, nnz_lno_t(this->b_col_cnt / this->concurrency + 1));
+  } else {
+    tmp_max_nnz *= hash_scaler;
+  }
+
+  // START OF SHARED MEMORY SIZE CALCULATIONS
+  // NOTE: the values computed here are not actually passed to functors
+  // requiring shmem, the calculations here are used for algorithm selection
+  const size_t block_bytes = sizeof(scalar_t) * block_dim * block_dim;
+  nnz_lno_t unit_memory =
+      sizeof(nnz_lno_t) * 2 + sizeof(nnz_lno_t) + block_bytes;
+  nnz_lno_t team_shmem_key_size =
+      ((shmem_size_to_use - sizeof(nnz_lno_t) * 4 - scalarAlignPad) /
+       unit_memory);
+  // alignment padding is per-thread for algorithms with per-thread hashmap
+  nnz_lno_t thread_memory =
+      ((shmem_size_to_use / suggested_team_size - scalarAlignPad) / 8) * 8;
+
+  nnz_lno_t thread_shmem_key_size =
+      ((thread_memory - sizeof(nnz_lno_t) * 4) / unit_memory);
+  if (Base::KOKKOSKERNELS_VERBOSE) {
+    std::cout << "\t\tinitial PortableNumericCHASH -- thread_memory:"
+              << thread_memory << " unit_memory:" << unit_memory
+              << " initial key size:" << thread_shmem_key_size << std::endl;
+    std::cout << "\t\tinitial PortableNumericCHASH -- team_memory:"
+              << shmem_size_to_use << " unit_memory:" << unit_memory
+              << " initial team key size:" << team_shmem_key_size << std::endl;
+  }
+  nnz_lno_t thread_shmem_hash_size = 1;
+  while (thread_shmem_hash_size * 2 <= thread_shmem_key_size) {
+    thread_shmem_hash_size = thread_shmem_hash_size * 2;
+  }
+  nnz_lno_t team_shmem_hash_size = 1;
+  while (team_shmem_hash_size * 2 <= team_shmem_key_size) {
+    team_shmem_hash_size = team_shmem_hash_size * 2;
+  }
+  // nnz_lno_t team_shared_memory_hash_func = team_shmem_hash_size - 1;
+
+  team_shmem_key_size =
+      team_shmem_key_size +
+      ((team_shmem_key_size - team_shmem_hash_size) * sizeof(nnz_lno_t)) /
+          (sizeof(nnz_lno_t) * 2 + block_bytes);
+  team_shmem_key_size = (team_shmem_key_size >> 1) << 1;
+
+  thread_shmem_key_size =
+      thread_shmem_key_size +
+      ((thread_shmem_key_size - thread_shmem_hash_size) * sizeof(nnz_lno_t)) /
+          (sizeof(nnz_lno_t) * 2 + block_bytes);
+  thread_shmem_key_size = (thread_shmem_key_size >> 1) << 1;
+
+  // choose parameters
+  if (this->spgemm_algorithm == SPGEMM_KK ||
+      SPGEMM_KK_LP == this->spgemm_algorithm) {
+    if (KokkosKernels::Impl::kk_is_gpu_exec_space<MyExecSpace>()) {
+      // then chose the best method and parameters.
+      size_type average_row_nnz = 0;
+      size_t average_row_flops  = 0;
+      if (this->a_row_cnt > 0) {
+        average_row_nnz   = overall_nnz / this->a_row_cnt;
+        average_row_flops = original_overall_flops / this->a_row_cnt;
+      }
+      int vector_length_max =
+          KokkosKernels::Impl::kk_get_max_vector_size<MyExecSpace>();
+      // if we have very low flops per row, or our maximum number of nnz is
+      // prett small, then we do row-base algorithm.
+      if (SPGEMM_KK_LP != this->spgemm_algorithm &&
+          (average_row_nnz < (size_type)vector_length_max ||
+           average_row_flops < 256)) {
+        algorithm_to_run = SPGEMM_KK_MEMORY;
+        // if (average_row_nnz / double (thread_shmem_key_size) > 1.5)
+        while (average_row_nnz > size_type(thread_shmem_key_size) &&
+               suggested_vector_size < vector_length_max) {
+          suggested_vector_size = suggested_vector_size * 2;
+          suggested_vector_size =
+              KOKKOSKERNELS_MACRO_MIN(vector_length_max, suggested_vector_size);
+          suggested_team_size =
+              this->handle->get_suggested_team_size(suggested_vector_size);
+          thread_memory = (shmem_size_to_use / 8 / suggested_team_size) * 8;
+          thread_shmem_key_size =
+              ((thread_memory - sizeof(nnz_lno_t) * 4) / unit_memory);
+          thread_shmem_hash_size = 1;
+          while (thread_shmem_hash_size * 2 <= thread_shmem_key_size) {
+            thread_shmem_hash_size = thread_shmem_hash_size * 2;
+          }
+          thread_shmem_key_size =
+              thread_shmem_key_size +
+              ((thread_shmem_key_size - thread_shmem_hash_size) *
+                   sizeof(nnz_lno_t) -
+               scalarAlignPad) /
+                  (sizeof(nnz_lno_t) * 2 + block_bytes);
+          thread_shmem_key_size = (thread_shmem_key_size >> 1) << 1;
+        }
+
+        if (Base::KOKKOSKERNELS_VERBOSE) {
+          std::cout << "\t\t\tRunning KKMEM with suggested_vector_size:"
+                    << suggested_vector_size
+                    << " suggested_team_size:" << suggested_team_size
+                    << std::endl;
+        }
+      } else {
+        nnz_lno_t tmp_team_cuckoo_key_size =
+            ((shmem_size_to_use - sizeof(nnz_lno_t) * 2 - scalarAlignPad) /
+             (sizeof(nnz_lno_t) + block_bytes));
+        int team_cuckoo_key_size = 1;
+        while (team_cuckoo_key_size * 2 < tmp_team_cuckoo_key_size)
+          team_cuckoo_key_size = team_cuckoo_key_size * 2;
+        suggested_vector_size = vector_length_max;
+        suggested_team_size =
+            this->handle->get_suggested_team_size(suggested_vector_size);
+        algorithm_to_run = SPGEMM_KK_MEMORY_BIGSPREADTEAM;
+        while (average_row_nnz <
+               team_cuckoo_key_size / 2 *
+                   (KOKKOSKERNELS_MACRO_MIN(first_level_cut_off + 0.05, 1))) {
+          shmem_size_to_use = shmem_size_to_use / 2;
+          tmp_team_cuckoo_key_size =
+              ((shmem_size_to_use - sizeof(nnz_lno_t) * 2 - scalarAlignPad) /
+               (sizeof(nnz_lno_t) + block_bytes));
+          team_cuckoo_key_size = 1;
+          while (team_cuckoo_key_size * 2 < tmp_team_cuckoo_key_size)
+            team_cuckoo_key_size = team_cuckoo_key_size * 2;
+
+          suggested_team_size = suggested_team_size / 2;
+        }
+        if (average_row_flops >
+                size_t(2) * suggested_team_size * suggested_vector_size &&
+            average_row_nnz >
+                size_type(team_cuckoo_key_size) *
+                    (KOKKOSKERNELS_MACRO_MIN(first_level_cut_off + 0.05, 1))) {
+          shmem_size_to_use = shmem_size_to_use * 2;
+          tmp_team_cuckoo_key_size =
+              ((shmem_size_to_use - sizeof(nnz_lno_t) * 2 - scalarAlignPad) /
+               (sizeof(nnz_lno_t) + block_bytes));
+          team_cuckoo_key_size = 1;
+          while (team_cuckoo_key_size * 2 < tmp_team_cuckoo_key_size)
+            team_cuckoo_key_size = team_cuckoo_key_size * 2;
+          suggested_team_size = suggested_team_size * 2;
+        }
+#ifdef FIRSTPARAMS
+        suggested_team_size = KOKKOSKERNELS_MACRO_MAX(4, suggested_team_size);
+#else
+        suggested_team_size = KOKKOSKERNELS_MACRO_MAX(2, suggested_team_size);
+#endif
+        if (max_nnz <
+            team_cuckoo_key_size *
+                KOKKOSKERNELS_MACRO_MIN(first_level_cut_off + 0.20, 1)) {
+          algorithm_to_run = SPGEMM_KK_MEMORY_SPREADTEAM;
+          if (Base::KOKKOSKERNELS_VERBOSE) {
+            std::cout << "\t\t\tRunning SPGEMM_KK_MEMORY_SPREADTEAM with "
+                         "suggested_vector_size:"
+                      << suggested_vector_size
+                      << " suggested_team_size:" << suggested_team_size
+                      << " shmem_size_to_use:" << shmem_size_to_use
+                      << std::endl;
+          }
+        } else {
+          if (Base::KOKKOSKERNELS_VERBOSE) {
+            std::cout << "\t\t\tRunning SPGEMM_KK_MEMORY_BIGSPREADTEAM with "
+                         "suggested_vector_size:"
+                      << suggested_vector_size
+                      << " suggested_team_size:" << suggested_team_size
+                      << " shmem_size_to_use:" << shmem_size_to_use
+                      << std::endl;
+          }
+        }
+      }
+    } else {
+      bool run_dense = false;
+      nnz_lno_t max_column_cut_off =
+          this->handle->get_spgemm_handle()->MaxColDenseAcc;
+      nnz_lno_t col_size = this->b_col_cnt;
+      if (col_size < max_column_cut_off) {
+        run_dense = true;
+        if (Base::KOKKOSKERNELS_VERBOSE) {
+          std::cout << "\t\t\tRunning SPGEMM_KK_DENSE col_size:" << col_size
+                    << " max_column_cut_off:" << max_column_cut_off
+                    << std::endl;
+        }
+      } else {
+        // round up maxNumRoughNonzeros to closest power of 2.
+        nnz_lno_t tmp_min_hash_size = 1;
+        while (tmp_max_nnz > tmp_min_hash_size) {
+          tmp_min_hash_size *= 4;
+        }
+
+        size_t kkmem_chunksize =
+            tmp_min_hash_size;                 // this is for used hash indices
+        kkmem_chunksize += tmp_min_hash_size;  // this is for the hash begins
+        kkmem_chunksize += max_nnz;            // this is for hash nexts
+        kkmem_chunksize = kkmem_chunksize * sizeof(nnz_lno_t) + scalarAlignPad;
+        size_t dense_chunksize =
+            (col_size + col_size / block_bytes + 1) * block_bytes;
+
+        if (kkmem_chunksize >= dense_chunksize * 0.5) {
+          run_dense = true;
+          if (Base::KOKKOSKERNELS_VERBOSE) {
+            std::cout << "\t\t\tRunning SPGEMM_KK_SPEED kkmem_chunksize:"
+                      << kkmem_chunksize
+                      << " dense_chunksize:" << dense_chunksize << std::endl;
+          }
+        } else {
+          run_dense = false;
+          if (Base::KOKKOSKERNELS_VERBOSE) {
+            std::cout << "\t\t\tRunning SPGEMM_KK_MEMORY col_size:" << col_size
+                      << " max_column_cut_off:" << max_column_cut_off
+                      << std::endl;
+          }
+        }
+      }
+
+      if (run_dense) {
+        this->KokkosBSPGEMM_numeric_speed(rowmapC_, entriesC_, valuesC_,
+                                          lcl_my_exec_space);
+        return;
+      }
+    }
+  }
+  if (Base::KOKKOSKERNELS_VERBOSE) {
+    std::cout << "\t\tPortableNumericCHASH -- adjusted hashsize:"
+              << thread_shmem_hash_size
+              << " thread_shmem_key_size:" << thread_shmem_key_size
+              << std::endl;
+    std::cout << "\t\tPortableNumericCHASH -- adjusted team hashsize:"
+              << team_shmem_hash_size
+              << " team_shmem_key_size:" << team_shmem_key_size << std::endl;
+  }
+  // END OF SHARED MEMORY SIZE CALCULATIONS
+
+  // required memory for L2
+  if (KokkosKernels::Impl::kk_is_gpu_exec_space<
+          typename HandleType::HandleExecSpace>()) {
+    if (algorithm_to_run == SPGEMM_KK_MEMORY_SPREADTEAM) {
+      tmp_max_nnz = 1;
+    } else if (algorithm_to_run == SPGEMM_KK_MEMORY_BIGSPREADTEAM) {
+    } else if (algorithm_to_run == SPGEMM_KK_MEMORY_BIGTEAM ||
+               algorithm_to_run == SPGEMM_KK_MEMORY_TEAM) {
+      // tmp_max_nnz -= team_shmem_key_size;
+    } else {
+      // tmp_max_nnz -= thread_shmem_key_size;
+    }
+  }
+
+  // START SIZE CALCULATIONS FOR MEMORYPOOL
+  if (algorithm_to_run == SPGEMM_KK_LP) {
+    while (tmp_max_nnz > min_hash_size) {
+      min_hash_size *= 4;
+    }
+    chunksize = min_hash_size;    // this is for used hash keys
+    chunksize += max_nnz;         // this is for used hash keys
+    chunksize += scalarAlignPad;  // for padding betwen keys and values
+    chunksize += min_hash_size * block_bytes /
+                 sizeof(nnz_lno_t);  // this is for the hash values
+  } else if (algorithm_to_run == SPGEMM_KK_MEMORY_BIGSPREADTEAM) {
+    while (tmp_max_nnz > min_hash_size) {
+      min_hash_size *= 2;  // try to keep it as low as possible because hashes
+                           // are not tracked.
+    }
+    chunksize = min_hash_size;    // this is for used hash keys
+    chunksize += scalarAlignPad;  // for padding between keys and values
+    chunksize += min_hash_size * block_bytes /
+                 sizeof(nnz_lno_t);  // this is for the hash values
+  } else {
+    while (tmp_max_nnz > min_hash_size) {
+      min_hash_size *= 4;
+    }
+    chunksize = min_hash_size;   // this is for used hash indices
+    chunksize += min_hash_size;  // this is for the hash begins
+    chunksize += max_nnz;        // this is for hash nexts
+  }
+
+  nnz_lno_t num_chunks =
+      this->template compute_num_pool_chunks<pool_memory_space>(
+          chunksize * sizeof(nnz_lno_t),
+          this->concurrency / suggested_vector_size);
+
+  // END SIZE CALCULATIONS FOR MEMORYPOOL
+
+  if (this->KOKKOSKERNELS_VERBOSE) {
+    std::cout << "\t\t max_nnz: " << max_nnz
+              << " min_hash_size:" << min_hash_size
+              << " concurrency:" << this->concurrency
+              << " MyExecSpace::concurrency():" << MyExecSpace::concurrency()
+              << " numchunks:" << num_chunks << std::endl;
+  }
+
+  KokkosKernels::Impl::PoolType my_pool_type =
+      KokkosKernels::Impl::OneThread2OneChunk;
+
+  if (KokkosKernels::Impl::kk_is_gpu_exec_space<MyExecSpace>()) {
+    my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk;
+  }
+
+  Kokkos::Timer timer1;
+  pool_memory_space m_space(num_chunks, chunksize, -1, my_pool_type);
+  MyExecSpace().fence();
+
+  if (this->KOKKOSKERNELS_VERBOSE) {
+    m_space.print_memory_pool();
+    std::cout << "\t\tPool Alloc Time:" << timer1.seconds() << std::endl;
+    std::cout << "\t\tPool Size(MB):"
+              << sizeof(nnz_lno_t) * (num_chunks * chunksize) / 1024. / 1024.
+              << std::endl;
+  }
+
+  PortableNumericCHASH<
+      const_a_lno_row_view_t, const_a_lno_nnz_view_t, const_a_scalar_nnz_view_t,
+      const_b_lno_row_view_t, const_b_lno_nnz_view_t, const_b_scalar_nnz_view_t,
+      c_row_view_t, c_lno_nnz_view_t, c_scalar_nnz_view_t, pool_memory_space>
+      sc(block_dim, this->a_row_cnt, Base::row_mapA, Base::entriesA,
+         Base::valsA, Base::row_mapB, Base::entriesB, Base::valsB,
+
+         rowmapC_, entriesC_, valuesC_, shmem_size_to_use,
+         suggested_vector_size, m_space, min_hash_size, max_nnz,
+         suggested_team_size,
+
+         lcl_my_exec_space, first_level_cut_off, flops_per_row,
+         this->KOKKOSKERNELS_VERBOSE);
+
+  if (this->KOKKOSKERNELS_VERBOSE) {
+    std::cout << "\t\tvector_size:" << suggested_vector_size
+              << " suggested_team_size:" << suggested_team_size << std::endl;
+  }
+  timer1.reset();
+
+  if (KokkosKernels::Impl::kk_is_gpu_exec_space<MyExecSpace>()) {
+    if (algorithm_to_run == SPGEMM_KK_MEMORY_SPREADTEAM) {
+      if (thread_shmem_key_size <= 0) {
+        std::cout << "KokkosBSPGEMM_numeric_hash SPGEMM_KK_MEMORY_SPREADTEAM: "
+                     "Insufficient shmem available for key for hash map "
+                     "accumulator - Terminating"
+                  << std::endl;
+        std::cout << "    thread_shmem_key_size = " << thread_shmem_key_size
+                  << std::endl;
+        throw std::runtime_error(
+            " KokkosBSPGEMM_numeric_hash SPGEMM_KK_MEMORY_SPREADTEAM: "
+            "Insufficient shmem available for key for hash map accumulator ");
+      }
+      int max_team_size = gpu_team_policy4_t(1, 1, suggested_vector_size)
+                              .team_size_max(sc, Kokkos::ParallelForTag());
+      int team_size = std::min(suggested_team_size, max_team_size);
+      sc.set_team_size(team_size);
+      Kokkos::parallel_for(
+          "KOKKOSPARSE::SPGEMM::SPGEMM_KK_MEMORY_SPREADTEAM",
+          gpu_team_policy4_t((this->a_row_cnt + team_size - 1) / team_size,
+                             team_size, suggested_vector_size),
+          sc);
+      MyExecSpace().fence();
+
+    } else if (algorithm_to_run == SPGEMM_KK_MEMORY_BIGSPREADTEAM) {
+      if (thread_shmem_key_size <= 0) {
+        std::cout << "KokkosBSPGEMM_numeric_hash "
+                     "SPGEMM_KK_MEMORY_BIGSPREADTEAM: Insufficient shmem "
+                     "available for key for hash map accumulator - Terminating"
+                  << std::endl;
+        std::cout << "    thread_shmem_key_size = " << thread_shmem_key_size
+                  << std::endl;
+        throw std::runtime_error(
+            " KokkosBSPGEMM_numeric_hash SPGEMM_KK_MEMORY_BIGSPREADTEAM: "
+            "Insufficient shmem available for key for hash map accumulator ");
+      }
+      int max_team_size = gpu_team_policy6_t(1, 1, suggested_vector_size)
+                              .team_size_max(sc, Kokkos::ParallelForTag());
+      int team_size = std::min(suggested_team_size, max_team_size);
+      sc.set_team_size(team_size);
+      Kokkos::parallel_for(
+          "KOKKOSPARSE::SPGEMM::SPGEMM_KK_MEMORY_BIGSPREADTEAM",
+          gpu_team_policy6_t((this->a_row_cnt + team_size - 1) / team_size,
+                             team_size, suggested_vector_size),
+          sc);
+    } else {
+      if (team_shmem_key_size <= 0) {
+        std::cout << "KokkosBSPGEMM_numeric_hash SPGEMM_KK_MEMORY: "
+                     "Insufficient shmem "
+                     "available for key for hash map accumulator - Terminating"
+                  << std::endl;
+        std::cout << "    team_shmem_key_size = " << team_shmem_key_size
+                  << std::endl;
+        throw std::runtime_error(
+            " KokkosBSPGEMM_numeric_hash SPGEMM_KK_MEMORY: Insufficient shmem "
+            "available for key for hash map accumulator ");
+      }
+      int max_team_size = gpu_team_policy_t(1, 1, suggested_vector_size)
+                              .team_size_max(sc, Kokkos::ParallelForTag());
+      int team_size = std::min(suggested_team_size, max_team_size);
+      sc.set_team_size(team_size);
+      Kokkos::parallel_for(
+          "KOKKOSPARSE::SPGEMM::SPGEMM_KK_MEMORY",
+          gpu_team_policy_t((this->a_row_cnt + team_size - 1) / team_size,
+                            team_size, suggested_vector_size),
+          sc);
+    }
+    MyExecSpace().fence();
+  } else {
+    if (algorithm_to_run == SPGEMM_KK_LP) {
+      if (Base::use_dynamic_schedule) {
+        Kokkos::parallel_for("KOKKOSPARSE::SPGEMM::SPGEMM_KK_LP::DYNAMIC",
+                             dynamic_multicore_team_policy4_t(
+                                 (this->a_row_cnt + suggested_team_size - 1) /
+                                     suggested_team_size,
+                                 suggested_team_size, suggested_vector_size),
+                             sc);
+      } else {
+        Kokkos::parallel_for("KOKKOSPARSE::SPGEMM::SPGEMM_KK_LP::STATIC",
+                             multicore_team_policy4_t(
+                                 (this->a_row_cnt + suggested_team_size - 1) /
+                                     suggested_team_size,
+                                 suggested_team_size, suggested_vector_size),
+                             sc);
+      }
+    } else {
+      if (Base::use_dynamic_schedule) {
+        Kokkos::parallel_for("KOKKOSPARSE::SPGEMM::KKMEM::DYNAMIC",
+                             dynamic_multicore_team_policy_t(
+                                 (this->a_row_cnt + suggested_team_size - 1) /
+                                     suggested_team_size,
+                                 suggested_team_size, suggested_vector_size),
+                             sc);
+      } else {
+        Kokkos::parallel_for("KOKKOSPARSE::SPGEMM::KKMEM::STATIC",
+                             multicore_team_policy_t(
+                                 (this->a_row_cnt + suggested_team_size - 1) /
+                                     suggested_team_size,
+                                 suggested_team_size, suggested_vector_size),
+                             sc);
+      }
+    }
+    MyExecSpace().fence();
+  }
+
+  if (this->KOKKOSKERNELS_VERBOSE) {
+    std::cout << "\t\tNumeric TIME:" << timer1.seconds() << std::endl;
+  }
+}
+
+}  // namespace Impl
+}  // namespace KokkosSparse
diff --git a/src/sparse/impl/KokkosSparse_bspgemm_impl_seq.hpp b/src/sparse/impl/KokkosSparse_bspgemm_impl_seq.hpp
new file mode 100644
index 0000000000..312ba22f8a
--- /dev/null
+++ b/src/sparse/impl/KokkosSparse_bspgemm_impl_seq.hpp
@@ -0,0 +1,182 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+#ifndef KOKKOSSPARSE_BSPGEMM_DEBUG_HPP_
+#define KOKKOSSPARSE_BSPGEMM_DEBUG_HPP_
+#include "KokkosKernels_helpers.hpp"
+#include "KokkosBatched_Gemm_Serial_Internal.hpp"
+#include <cstring>
+
+namespace KokkosSparse {
+
+namespace Impl {
+
+template <typename data_view_t>
+using kk_subview1d =
+    decltype(Kokkos::subview(data_view_t(), Kokkos::make_pair(0, 0)));
+
+// Returns subview
+template <typename data_view_t, typename size_type, typename lno_t>
+KOKKOS_INLINE_FUNCTION kk_subview1d<data_view_t> get_block(
+    data_view_t data, size_type block_index, lno_t block_size) {
+  const auto i = block_index * block_size;
+  return Kokkos::subview(data, Kokkos::make_pair(i, i + block_size));
+}
+
+template <typename KernelHandle, typename alno_row_view_t_,
+          typename alno_nnz_view_t_, typename ascalar_nnz_view_t_,
+          typename blno_row_view_t_, typename blno_nnz_view_t_,
+          typename bscalar_nnz_view_t_, typename clno_row_view_t_,
+          typename clno_nnz_view_t_, typename cscalar_nnz_view_t_>
+void bspgemm_debug_numeric(KernelHandle* /* handle */,
+                           typename KernelHandle::nnz_lno_t m,
+                           typename KernelHandle::nnz_lno_t /* n */,
+                           typename KernelHandle::nnz_lno_t k,
+                           typename KernelHandle::nnz_lno_t block_dim,
+                           alno_row_view_t_ row_mapA, alno_nnz_view_t_ entriesA,
+                           ascalar_nnz_view_t_ valuesA,
+
+                           bool /* transposeA */, blno_row_view_t_ row_mapB,
+                           blno_nnz_view_t_ entriesB,
+                           bscalar_nnz_view_t_ valuesB, bool /* transposeB */,
+                           clno_row_view_t_ row_mapC, clno_nnz_view_t_ entriesC,
+                           cscalar_nnz_view_t_ valuesC) {
+  typename alno_row_view_t_::HostMirror h_rma =
+      Kokkos::create_mirror_view(row_mapA);
+  Kokkos::deep_copy(h_rma, row_mapA);
+  typename alno_nnz_view_t_::HostMirror h_enta =
+      Kokkos::create_mirror_view(entriesA);
+  Kokkos::deep_copy(h_enta, entriesA);
+  typename ascalar_nnz_view_t_::HostMirror h_vala =
+      Kokkos::create_mirror_view(valuesA);
+  Kokkos::deep_copy(h_vala, valuesA);
+
+  typename blno_row_view_t_::HostMirror h_rmb =
+      Kokkos::create_mirror_view(row_mapB);
+  Kokkos::deep_copy(h_rmb, row_mapB);
+  typename blno_nnz_view_t_::HostMirror h_entb =
+      Kokkos::create_mirror_view(entriesB);
+  Kokkos::deep_copy(h_entb, entriesB);
+  typename bscalar_nnz_view_t_::HostMirror h_valb =
+      Kokkos::create_mirror_view(valuesB);
+  Kokkos::deep_copy(h_valb, valuesB);
+  typename clno_row_view_t_::HostMirror h_rmc =
+      Kokkos::create_mirror_view(row_mapC);
+  Kokkos::deep_copy(h_rmc, row_mapC);
+
+  typename clno_nnz_view_t_::HostMirror h_entc =
+      Kokkos::create_mirror_view(entriesC);
+  typename cscalar_nnz_view_t_::HostMirror h_valc =
+      Kokkos::create_mirror_view(valuesC);
+  Kokkos::fence();
+
+  typedef typename KernelHandle::nnz_lno_t lno_t;
+  typedef typename KernelHandle::size_type size_type;
+  typedef typename KernelHandle::nnz_scalar_t scalar_t;
+  typedef KokkosBatched::SerialGemmInternal<
+      KokkosBatched::Algo::Gemm::Unblocked>
+      GEMM;
+
+  const auto block_size = block_dim * block_dim;
+  const auto ZERO       = static_cast<scalar_t>(0);
+  const auto ONE        = static_cast<scalar_t>(1);
+
+  typename cscalar_nnz_view_t_::HostMirror accumulator("acc", k * block_size);
+  Kokkos::deep_copy(accumulator, ZERO);
+  Kokkos::fence();
+  std::vector<bool> acc_flag(k, false);
+
+  h_rmc(0) = 0;
+  for (lno_t i = 0; i < m; ++i) {
+    const size_type a_row_begin = h_rma(i);
+    const size_type a_row_end   = h_rma(i + 1);
+    lno_t a_row_size            = a_row_end - a_row_begin;
+
+    size_type c_row_begin    = h_rmc(i);
+    lno_t c_row_size         = h_rmc(i + 1) - c_row_begin;
+    lno_t c_row_size_counter = 0;
+
+    for (lno_t j = 0; j < a_row_size; ++j) {
+      size_type a_ind             = a_row_begin + j;
+      lno_t col                   = h_enta(a_ind);
+      auto a_val                  = &h_vala(a_ind * block_size);
+      const size_type b_row_begin = h_rmb(col);
+      const size_type b_row_end   = h_rmb(col + 1);
+      lno_t b_row_size            = b_row_end - b_row_begin;
+      for (lno_t z = 0; z < b_row_size; ++z) {
+        size_type b_ind = b_row_begin + z;
+        lno_t b_col     = h_entb(b_ind);
+        auto b_val      = &h_valb(b_ind * block_size);
+
+        if (acc_flag[b_col] == false) {
+          acc_flag[b_col]                            = true;
+          h_entc(c_row_begin + c_row_size_counter++) = b_col;
+        }
+        // accumulator(b_col) += a_val * b_val
+        auto acc = get_block(accumulator, b_col, block_size);
+        GEMM::invoke(block_dim, block_dim, block_dim, ONE, a_val, block_dim, 1,
+                     b_val, block_dim, 1, ONE, acc.data(), block_dim, 1);
+      }
+    }
+
+    // if (i == 0) std::cout << "result_cols" << std::endl;
+
+    for (lno_t j = 0; j < c_row_size; ++j) {
+      size_type c_ind  = c_row_begin + j;
+      lno_t result_col = h_entc(c_ind);
+      auto acc         = get_block(accumulator, result_col, block_size);
+      Kokkos::deep_copy(get_block(h_valc, c_ind, block_size), acc);
+      Kokkos::deep_copy(acc, ZERO);
+      Kokkos::fence();
+      acc_flag[result_col] = false;
+    }
+  }
+
+  Kokkos::deep_copy(entriesC, h_entc);
+  Kokkos::deep_copy(valuesC, h_valc);
+  Kokkos::fence();
+}
+
+}  // namespace Impl
+}  // namespace KokkosSparse
+#endif
diff --git a/src/sparse/impl/KokkosSparse_bspgemm_impl_speed.hpp b/src/sparse/impl/KokkosSparse_bspgemm_impl_speed.hpp
new file mode 100644
index 0000000000..372e5d10dd
--- /dev/null
+++ b/src/sparse/impl/KokkosSparse_bspgemm_impl_speed.hpp
@@ -0,0 +1,657 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include "KokkosKernels_Utils.hpp"
+
+namespace KokkosSparse {
+
+namespace Impl {
+
+template <typename HandleType, typename a_row_view_t_,
+          typename a_lno_nnz_view_t_, typename a_scalar_nnz_view_t_,
+          typename b_lno_row_view_t_, typename b_lno_nnz_view_t_,
+          typename b_scalar_nnz_view_t_>
+template <typename a_row_view_t, typename a_nnz_view_t,
+          typename a_scalar_view_t, typename b_row_view_t,
+          typename b_nnz_view_t, typename b_scalar_view_t,
+          typename c_row_view_t, typename c_nnz_view_t,
+          typename c_scalar_view_t, typename mpool_type>
+struct KokkosBSPGEMM<HandleType, a_row_view_t_, a_lno_nnz_view_t_,
+                     a_scalar_nnz_view_t_, b_lno_row_view_t_, b_lno_nnz_view_t_,
+                     b_scalar_nnz_view_t_>::NumericCMEM_CPU {
+  using BSPGEMM = KokkosBSPGEMM<HandleType, a_row_view_t_, a_lno_nnz_view_t_,
+                                a_scalar_nnz_view_t_, b_lno_row_view_t_,
+                                b_lno_nnz_view_t_, b_scalar_nnz_view_t_>;
+
+  nnz_lno_t numrows;
+  nnz_lno_t numcols;
+  nnz_lno_t block_dim;
+  nnz_lno_t block_size;
+
+  a_row_view_t row_mapA;
+  a_nnz_view_t entriesA;
+  a_scalar_view_t valuesA;
+
+  b_row_view_t row_mapB;
+  b_nnz_view_t entriesB;
+  b_scalar_view_t valuesB;
+
+  c_row_view_t rowmapC;
+  c_nnz_view_t entriesC;
+  c_scalar_view_t valuesC;
+  mpool_type memory_space;
+
+  nnz_lno_t *pEntriesC;
+  scalar_t *pVals;
+  const KokkosKernels::Impl::ExecSpaceType my_exec_space;
+  const nnz_lno_t team_work_size;
+
+  NumericCMEM_CPU(nnz_lno_t m_, nnz_lno_t k_, nnz_lno_t block_dim_,
+                  a_row_view_t row_mapA_, a_nnz_view_t entriesA_,
+                  a_scalar_view_t valuesA_,
+
+                  b_row_view_t row_mapB_, b_nnz_view_t entriesB_,
+                  b_scalar_view_t valuesB_,
+
+                  c_row_view_t rowmapC_, c_nnz_view_t entriesC_,
+                  c_scalar_view_t valuesC_, mpool_type memory_space_,
+                  const KokkosKernels::Impl::ExecSpaceType my_exec_space_,
+                  nnz_lno_t team_row_chunk_size)
+      : numrows(m_),
+        numcols(k_),
+        block_dim(block_dim_),
+        block_size(block_dim_ * block_dim_),
+        row_mapA(row_mapA_),
+        entriesA(entriesA_),
+        valuesA(valuesA_),
+
+        row_mapB(row_mapB_),
+        entriesB(entriesB_),
+        valuesB(valuesB_),
+
+        rowmapC(rowmapC_),
+        entriesC(entriesC_),
+        valuesC(valuesC_),
+        memory_space(memory_space_),
+        pEntriesC(entriesC_.data()),
+        pVals(valuesC.data()),
+        my_exec_space(my_exec_space_),
+        team_work_size(team_row_chunk_size) {}
+
+  KOKKOS_INLINE_FUNCTION
+  size_t get_thread_id(const size_t row_index) const {
+    switch (my_exec_space) {
+      default: return row_index;
+#if defined(KOKKOS_ENABLE_SERIAL)
+      case KokkosKernels::Impl::Exec_SERIAL: return 0;
+#endif
+#if defined(KOKKOS_ENABLE_OPENMP)
+      case KokkosKernels::Impl::Exec_OMP:
+        return Kokkos::OpenMP::impl_hardware_thread_id();
+#endif
+#if defined(KOKKOS_ENABLE_THREADS)
+      case KokkosKernels::Impl::Exec_THREADS:
+        return Kokkos::Threads::impl_hardware_thread_id();
+#endif
+#if defined(KOKKOS_ENABLE_CUDA)
+      case KokkosKernels::Impl::Exec_CUDA: return row_index;
+#endif
+#if defined(KOKKOS_ENABLE_HIP)
+      case KokkosKernels::Impl::Exec_HIP: return row_index;
+#endif
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const MultiCoreTag &, const team_member_t &teamMember) const {
+    nnz_lno_t team_row_begin = teamMember.league_rank() * team_work_size;
+    const nnz_lno_t team_row_end =
+        KOKKOSKERNELS_MACRO_MIN(team_row_begin + team_work_size, numrows);
+
+    scalar_t *dense_accum = NULL;
+    size_t tid = get_thread_id(team_row_begin + teamMember.team_rank());
+    while (dense_accum == NULL) {
+      dense_accum = (scalar_t *)(memory_space.allocate_chunk(tid));
+    }
+    char *marker = (char *)(dense_accum + numcols * block_size);
+
+    // Performs C[row_index,b_col_ind] += A[row_index,rowB] * B[rowB,b_col_ind]
+    // using dense_accum[col] to accumulate scalar values,
+    // marker[col] for boolean flags denoting initialized accumulators
+    // and col=pEntriesC[i] to index sparse column indices.
+    // Note: each CPU thread works on its own row, thus no need for locking.
+    Kokkos::parallel_for(
+        Kokkos::TeamThreadRange(teamMember, team_row_begin, team_row_end),
+        [&](const nnz_lno_t &row_index) {
+          const size_type c_row_begin = rowmapC[row_index];
+          nnz_lno_t *myentries        = pEntriesC + c_row_begin;
+          scalar_t *myvals            = pVals + c_row_begin * block_size;
+
+          nnz_lno_t current_col_index = 0;
+          const size_type col_begin   = row_mapA[row_index];
+          const nnz_lno_t row_size    = row_mapA[row_index + 1] - col_begin;
+
+          for (nnz_lno_t colind = 0; colind < row_size; ++colind) {
+            size_type a_col       = colind + col_begin;
+            nnz_lno_t rowB        = entriesA[a_col];
+            const scalar_t *a_val = &valuesA[a_col * block_size];
+
+            size_type rowBegin  = row_mapB(rowB);
+            nnz_lno_t left_work = row_mapB(rowB + 1) - rowBegin;
+            for (int i = 0; i < left_work; ++i) {
+              const size_type adjind = i + rowBegin;
+              nnz_lno_t b_col_ind    = entriesB[adjind];
+              const scalar_t *b_val  = &valuesB[adjind * block_size];
+              if (marker[b_col_ind] == 0) {
+                marker[b_col_ind]              = 1;
+                myentries[current_col_index++] = b_col_ind;
+              }
+              kk_block_add_mul(block_dim, dense_accum + b_col_ind * block_size,
+                               a_val, b_val);
+            }
+          }
+          for (nnz_lno_t i = 0; i < current_col_index; ++i) {
+            nnz_lno_t ind = myentries[i];
+            scalar_t *acc = dense_accum + ind * block_size;
+            kk_block_set(block_dim, myvals + i * block_size, acc);
+            kk_block_init(block_dim, acc);
+            marker[ind] = 0;
+          }
+        });
+    memory_space.release_chunk(dense_accum);
+  }
+};
+
+template <typename HandleType, typename a_row_view_t_,
+          typename a_lno_nnz_view_t_, typename a_scalar_nnz_view_t_,
+          typename b_lno_row_view_t_, typename b_lno_nnz_view_t_,
+          typename b_scalar_nnz_view_t_>
+template <typename a_row_view_t__, typename a_nnz_view_t__,
+          typename a_scalar_view_t__, typename b_row_view_t__,
+          typename b_nnz_view_t__, typename b_scalar_view_t__,
+          typename c_row_view_t__, typename c_nnz_view_t__,
+          typename c_scalar_view_t__, typename c_nnz_tmp_view_t>
+
+struct KokkosBSPGEMM<HandleType, a_row_view_t_, a_lno_nnz_view_t_,
+                     a_scalar_nnz_view_t_, b_lno_row_view_t_, b_lno_nnz_view_t_,
+                     b_scalar_nnz_view_t_>::NumericCMEM {
+  static constexpr auto scalarAlignPad =
+      KokkosBSPGEMM<HandleType, a_row_view_t_, a_lno_nnz_view_t_,
+                    a_scalar_nnz_view_t_, b_lno_row_view_t_, b_lno_nnz_view_t_,
+                    b_scalar_nnz_view_t_>::scalarAlignPad;
+
+  nnz_lno_t numrows;
+  nnz_lno_t block_dim;
+  nnz_lno_t block_size;
+
+  a_row_view_t__ row_mapA;
+  a_nnz_view_t__ entriesA;
+  a_scalar_view_t__ valuesA;
+
+  b_row_view_t__ row_mapB;
+  b_nnz_view_t__ entriesB;
+  b_scalar_view_t__ valuesB;
+
+  c_row_view_t__ rowmapC;
+  c_nnz_view_t__ entriesC;
+  c_scalar_view_t__ valuesC;
+
+  c_nnz_tmp_view_t beginsC;
+  c_nnz_tmp_view_t nextsC;
+
+  nnz_lno_t *pbeginsC, *pnextsC, *pEntriesC;
+  scalar_t *pvaluesC;
+
+  const size_t shared_memory_size;
+  const int vector_size;
+  const nnz_lno_t team_work_size;
+
+  const int unit_memory;  // begins, nexts, and keys. No need for vals yet.
+  const int suggested_team_size;
+  const int thread_memory;
+  nnz_lno_t shmem_key_size;
+  nnz_lno_t shared_memory_hash_func;
+  nnz_lno_t shmem_hash_size;
+
+  NumericCMEM(nnz_lno_t m_, nnz_lno_t block_dim_, a_row_view_t__ row_mapA_,
+              a_nnz_view_t__ entriesA_, a_scalar_view_t__ valuesA_,
+
+              b_row_view_t__ row_mapB_, b_nnz_view_t__ entriesB_,
+              b_scalar_view_t__ valuesB_,
+
+              c_row_view_t__ rowmapC_, c_nnz_view_t__ entriesC_,
+              c_scalar_view_t__ valuesC_,
+
+              c_nnz_tmp_view_t beginsC_, c_nnz_tmp_view_t nextsC_,
+
+              const size_type sharedMemorySize_,
+              const int suggested_vector_size,
+              const nnz_lno_t team_row_chunk_size, int suggested_team_size_,
+              bool KOKKOSKERNELS_VERBOSE_)
+      : numrows(m_),
+        block_dim(block_dim_),
+        block_size(block_dim_ * block_dim_),
+
+        row_mapA(row_mapA_),
+        entriesA(entriesA_),
+        valuesA(valuesA_),
+
+        row_mapB(row_mapB_),
+        entriesB(entriesB_),
+        valuesB(valuesB_),
+
+        rowmapC(rowmapC_),
+        entriesC(entriesC_),
+        valuesC(valuesC_),
+        beginsC(beginsC_),
+        nextsC(nextsC_),
+        pbeginsC(beginsC_.data()),
+        pnextsC(nextsC_.data()),
+        pEntriesC(entriesC_.data()),
+        pvaluesC(valuesC_.data()),
+        shared_memory_size(sharedMemorySize_),
+
+        vector_size(suggested_vector_size),
+        team_work_size(team_row_chunk_size),
+
+        unit_memory(sizeof(nnz_lno_t) * 2 + sizeof(nnz_lno_t) +
+                    sizeof(scalar_t) * block_size),
+        suggested_team_size(suggested_team_size_),
+        thread_memory((shared_memory_size / 8 / suggested_team_size_) * 8),
+        shmem_key_size(),
+        shared_memory_hash_func(),
+        shmem_hash_size(1) {
+    shmem_key_size = ((thread_memory - sizeof(nnz_lno_t) * 2 - scalarAlignPad) /
+                      unit_memory);
+    if (KOKKOSKERNELS_VERBOSE_) {
+      std::cout << "\t\tNumericCMEM -- thread_memory:" << thread_memory
+                << " unit_memory:" << unit_memory
+                << " initial key size:" << shmem_key_size << std::endl;
+    }
+    while (shmem_hash_size * 2 <= shmem_key_size) {
+      shmem_hash_size = shmem_hash_size * 2;
+    }
+    shared_memory_hash_func = shmem_hash_size - 1;
+    shmem_key_size          = shmem_key_size +
+                     ((shmem_key_size - shmem_hash_size) * sizeof(nnz_lno_t)) /
+                         (unit_memory - sizeof(nnz_lno_t));
+    shmem_key_size = (shmem_key_size >> 1) << 1;
+
+    if (KOKKOSKERNELS_VERBOSE_) {
+      std::cout << "\t\tNumericCMEM -- adjusted hashsize:" << shmem_hash_size
+                << " shmem_key_size:" << shmem_key_size << std::endl;
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const GPUTag &, const team_member_t &teamMember) const {
+    // get the beginning and end rows of the team.
+    nnz_lno_t team_row_begin = teamMember.league_rank() * team_work_size;
+    const nnz_lno_t team_row_end =
+        KOKKOSKERNELS_MACRO_MIN(team_row_begin + team_work_size, numrows);
+
+    char *all_shared_memory =
+        (char *)(teamMember.team_shmem().get_shmem(shared_memory_size));
+
+    // shift it to the thread private part
+    all_shared_memory += thread_memory * teamMember.team_rank();
+
+    // used_hash_sizes hold the size of 1st and 2nd level hashes
+    volatile nnz_lno_t *used_hash_sizes =
+        (volatile nnz_lno_t *)(all_shared_memory);
+    all_shared_memory += sizeof(nnz_lno_t) * 2;
+
+    nnz_lno_t *begins = (nnz_lno_t *)(all_shared_memory);
+    all_shared_memory += sizeof(nnz_lno_t) * shmem_hash_size;
+
+    // poins to the next elements
+    nnz_lno_t *nexts = (nnz_lno_t *)(all_shared_memory);
+    all_shared_memory += sizeof(nnz_lno_t) * shmem_key_size;
+
+    // holds the keys
+    nnz_lno_t *keys = (nnz_lno_t *)(all_shared_memory);
+    all_shared_memory += sizeof(nnz_lno_t) * shmem_key_size;
+    scalar_t *vals =
+        KokkosKernels::Impl::alignPtr<char *, scalar_t>(all_shared_memory);
+
+    KokkosKernels::Experimental::BlockHashmapAccumulator<
+        nnz_lno_t, nnz_lno_t, scalar_t,
+        KokkosKernels::Experimental::HashOpType::bitwiseAnd>
+        hm(block_dim, shmem_key_size, shared_memory_hash_func, begins, nexts,
+           keys, vals);
+
+    // issue-508, TODO: understand and re-work below parallel_for loop.
+    // Inialize hm2 with correct max_value_size and hashOpRHS
+    // global_memory_hash_size is computed, per team of threads -- this is
+    // hashOpRHS.
+
+    KokkosKernels::Experimental::BlockHashmapAccumulator<
+        nnz_lno_t, nnz_lno_t, scalar_t,
+        KokkosKernels::Experimental::HashOpType::modulo>
+        hm2(block_dim, 0, 0, NULL, NULL, NULL, NULL);
+    /*
+    KokkosKernels::Experimental::HashmapAccumulator<nnz_lno_t,nnz_lno_t,scalar_t>
+    hm2(global_memory_hash_size, global_memory_hash_size,
+        pbeginsC + c_row_begin, pnextsC + c_row_begin, pEntriesC + c_row_begin,
+    pvaluesC + c_row_begin);
+        */
+
+    Kokkos::parallel_for(
+        Kokkos::TeamThreadRange(teamMember, team_row_begin, team_row_end),
+        [&](const nnz_lno_t &row_index) {
+          const size_type c_row_begin = rowmapC[row_index];
+          const nnz_lno_t global_memory_hash_size =
+              nnz_lno_t(rowmapC[row_index + 1] - c_row_begin);
+
+          hm2.keys        = pEntriesC + c_row_begin;
+          hm2.values      = pvaluesC + c_row_begin * block_size;
+          hm2.hash_begins = pbeginsC + c_row_begin;
+          hm2.hash_nexts  = pnextsC + c_row_begin;
+
+          // initialize begins.
+          Kokkos::parallel_for(
+              Kokkos::ThreadVectorRange(teamMember, shmem_hash_size),
+              [&](int i) { begins[i] = -1; });
+
+          // initialize hash usage sizes
+          Kokkos::single(Kokkos::PerThread(teamMember), [&]() {
+            used_hash_sizes[0] = 0;
+            used_hash_sizes[1] = 0;
+          });
+
+          const size_type col_begin = row_mapA[row_index];
+          const nnz_lno_t left_work =
+              nnz_lno_t(row_mapA[row_index + 1] - col_begin);
+
+          for (nnz_lno_t colind = 0; colind < left_work; ++colind) {
+            size_type a_col       = colind + col_begin;
+            nnz_lno_t rowB        = entriesA[a_col];
+            const scalar_t *a_val = &valuesA[a_col * block_size];
+
+            size_type rowBegin   = row_mapB(rowB);
+            nnz_lno_t left_work_ = row_mapB(rowB + 1) - rowBegin;
+
+            while (left_work_) {
+              nnz_lno_t work_to_handle =
+                  KOKKOSKERNELS_MACRO_MIN(vector_size, left_work_);
+              nnz_lno_t b_col_ind   = -1;
+              const scalar_t *b_val = nullptr;
+              Kokkos::parallel_for(
+                  Kokkos::ThreadVectorRange(teamMember, work_to_handle),
+                  [&](nnz_lno_t i) {
+                    const size_type adjind = i + rowBegin;
+                    b_col_ind              = entriesB[adjind];
+                    b_val                  = &valuesB[adjind * block_size];
+                  });
+
+              int num_unsuccess = hm.vector_atomic_insert_into_hash_mergeAdd(
+                  b_col_ind, a_val, b_val, used_hash_sizes);
+
+              int overall_num_unsuccess = 0;
+
+              Kokkos::parallel_reduce(
+                  Kokkos::ThreadVectorRange(teamMember, vector_size),
+                  [&](const int /* threadid */, int &overall_num_unsuccess_) {
+                    overall_num_unsuccess_ += num_unsuccess;
+                  },
+                  overall_num_unsuccess);
+
+              if (overall_num_unsuccess) {
+                nnz_lno_t hash_ = -1;
+                if (num_unsuccess) {
+                  hash_ = b_col_ind % global_memory_hash_size;
+                }
+
+                // int insertion =
+                hm2.vector_atomic_insert_into_hash_mergeAdd_with_team_level_list_length(
+                    teamMember, vector_size, hash_, b_col_ind, a_val, b_val,
+                    used_hash_sizes + 1, global_memory_hash_size);
+              }
+              left_work_ -= work_to_handle;
+              rowBegin += work_to_handle;
+            }
+          }
+
+          Kokkos::single(Kokkos::PerThread(teamMember), [&]() {
+            if (used_hash_sizes[0] > shmem_key_size)
+              used_hash_sizes[0] = shmem_key_size;
+          });
+
+          size_type num_elements = used_hash_sizes[0];
+
+          size_type written_index = used_hash_sizes[1];
+          Kokkos::parallel_for(
+              Kokkos::ThreadVectorRange(teamMember, num_elements),
+              [&](size_type i) {
+                const auto idx = c_row_begin + written_index + i;
+                pEntriesC[idx] = keys[i];
+                kk_block_set(block_dim, pvaluesC + idx * block_size,
+                             vals + i * block_size);
+              });
+        });
+  }
+
+  size_t team_shmem_size(int /* team_size */) const {
+    return shared_memory_size;
+  }
+};
+
+//
+// * Notes on KokkosBSPGEMM_numeric_speed *
+//
+// Prior to this routine, KokkosBSPGEMM_numeric(...) was called
+//
+//   KokkosBSPGEMM_numeric(...) :
+//     if (this->spgemm_algorithm == SPGEMM_KK || SPGEMM_KK_LP ==
+//     this->spgemm_algorithm) :
+//       call KokkosBSPGEMM_numeric_speed(...)
+//     else:
+//       call  KokkosBSPGEMM_numeric_hash(...)
+//
+//
+// KokkosBSPGEMM_numeric_speed:
+//
+// Algorithm selection as follows and matching to kernel Tag:
+//
+//  Policy typedefs with tags found in: KokkosSparse_spgemm_impl.hpp
+//
+//  if GPU:
+//    "KokkosSparse::NumericCMEM::KKSPEED::GPU" : gpu_team_policy_t,  i.e.
+//    GPUTag
+//
+//  else :
+//    "KokkosSparse::NumericCMEM_CPU::DENSE::DYNAMIC" :
+//    dynamic_multicore_team_policy_t,  i.e. MultiCoreTag
+//    "KokkosSparse::NumericCMEM_CPU::DENSE::STATIC" :  multicore_team_policy_t,
+//    i.e. MultiCoreTag
+//
+
+template <typename HandleType, typename a_row_view_t_,
+          typename a_lno_nnz_view_t_, typename a_scalar_nnz_view_t_,
+          typename b_lno_row_view_t_, typename b_lno_nnz_view_t_,
+          typename b_scalar_nnz_view_t_>
+template <typename c_row_view_t, typename c_lno_nnz_view_t,
+          typename c_scalar_nnz_view_t>
+void KokkosBSPGEMM<HandleType, a_row_view_t_, a_lno_nnz_view_t_,
+                   a_scalar_nnz_view_t_, b_lno_row_view_t_, b_lno_nnz_view_t_,
+                   b_scalar_nnz_view_t_>::
+    KokkosBSPGEMM_numeric_speed(
+        c_row_view_t rowmapC_, c_lno_nnz_view_t entriesC_,
+        c_scalar_nnz_view_t valuesC_,
+        KokkosKernels::Impl::ExecSpaceType my_exec_space_) {
+  if (Base::KOKKOSKERNELS_VERBOSE) {
+    std::cout << "\tSPEED MODE" << std::endl;
+  }
+
+  nnz_lno_t brows = this->row_mapB.extent(0) - 1;
+  size_type bnnz  = this->valsB.extent(0);
+
+  // get suggested vector size, teamsize and row chunk size.
+  int suggested_vector_size =
+      this->handle->get_suggested_vector_size(brows, bnnz);
+  int suggested_team_size =
+      this->handle->get_suggested_team_size(suggested_vector_size);
+  nnz_lno_t team_row_chunk_size = this->handle->get_team_work_size(
+      suggested_team_size, this->concurrency, Base::a_row_cnt);
+
+  Kokkos::Timer numeric_speed_timer_with_free;
+
+  if (KokkosKernels::Impl::kk_is_gpu_exec_space<
+          typename HandleType::HandleExecSpace>()) {
+    // allocate memory for begins and next to be used by the hashmap
+    nnz_lno_temp_work_view_t beginsC(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing, "C keys"),
+        valuesC_.extent(0));
+    nnz_lno_temp_work_view_t nextsC(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing, "C nexts"),
+        valuesC_.extent(0));
+    Kokkos::deep_copy(beginsC, -1);
+
+    // create the functor.
+    NumericCMEM<const_a_lno_row_view_t, const_a_lno_nnz_view_t,
+                const_a_scalar_nnz_view_t, const_b_lno_row_view_t,
+                const_b_lno_nnz_view_t, const_b_scalar_nnz_view_t, c_row_view_t,
+                c_lno_nnz_view_t, c_scalar_nnz_view_t, nnz_lno_temp_work_view_t>
+        sc(Base::a_row_cnt, block_dim, this->row_mapA, this->entriesA,
+           this->valsA, this->row_mapB, this->entriesB, this->valsB,
+
+           rowmapC_, entriesC_, valuesC_,
+
+           beginsC, nextsC, this->shmem_size, suggested_vector_size,
+           team_row_chunk_size, suggested_team_size,
+           Base::KOKKOSKERNELS_VERBOSE);
+
+    Kokkos::Timer timer1;
+    MyExecSpace().fence();
+
+    if (Base::KOKKOSKERNELS_VERBOSE) {
+      std::cout << "\t\tGPU vector_size:" << suggested_vector_size
+                << " team_size:" << suggested_team_size
+                << " chunk_size:" << team_row_chunk_size << std::endl;
+    }
+
+    timer1.reset();
+    // this is basically kkmem without memory pools.
+    // only executed for to check the effect of memory pools.
+    Kokkos::parallel_for(
+        "KokkosSparse::NumericCMEM::KKSPEED::GPU",
+        gpu_team_policy_t(Base::a_row_cnt / team_row_chunk_size + 1,
+                          suggested_team_size, suggested_vector_size),
+        sc);
+    MyExecSpace().fence();
+
+    if (Base::KOKKOSKERNELS_VERBOSE) {
+      std::cout << "\t\tNumeric TIME:" << timer1.seconds() << std::endl;
+    }
+  } else {
+    Kokkos::Timer numeric_speed_timer;
+    typedef KokkosKernels::Impl::UniformMemoryPool<MyTempMemorySpace, scalar_t>
+        pool_memory_space;
+
+    KokkosKernels::Impl::PoolType my_pool_type =
+        KokkosKernels::Impl::OneThread2OneChunk;
+    int num_chunks = this->concurrency;
+
+    Kokkos::Timer timer1;
+    const size_t chunk_size = this->b_col_cnt * block_dim * block_dim +
+                              this->b_col_cnt / sizeof(scalar_t) + 1;
+    pool_memory_space m_space(num_chunks, chunk_size, 0, my_pool_type);
+    MyExecSpace().fence();
+
+    if (Base::KOKKOSKERNELS_VERBOSE) {
+      std::cout << "\t\tPool Alloc Time:" << timer1.seconds() << std::endl;
+      std::cout << "\tPool Size(MB):"
+                << sizeof(scalar_t) * (num_chunks * chunk_size) / 1024. / 1024.
+                << std::endl;
+    }
+
+    NumericCMEM_CPU<const_a_lno_row_view_t, const_a_lno_nnz_view_t,
+                    const_a_scalar_nnz_view_t, const_b_lno_row_view_t,
+                    const_b_lno_nnz_view_t, const_b_scalar_nnz_view_t,
+                    c_row_view_t, c_lno_nnz_view_t, c_scalar_nnz_view_t,
+                    pool_memory_space>
+        sc(Base::a_row_cnt, this->b_col_cnt, block_dim, this->row_mapA,
+           this->entriesA, this->valsA, this->row_mapB, this->entriesB,
+           this->valsB,
+
+           rowmapC_, entriesC_, valuesC_, m_space, my_exec_space_,
+           team_row_chunk_size);
+
+    MyExecSpace().fence();
+    if (Base::KOKKOSKERNELS_VERBOSE) {
+      std::cout << "\t\tCPU vector_size:" << suggested_vector_size
+                << " team_size:" << suggested_team_size
+                << " chunk_size:" << team_row_chunk_size << std::endl;
+    }
+    timer1.reset();
+
+    if (this->use_dynamic_schedule) {
+      Kokkos::parallel_for("KokkosSparse::NumericCMEM_CPU::DENSE::DYNAMIC",
+                           dynamic_multicore_team_policy_t(
+                               Base::a_row_cnt / team_row_chunk_size + 1,
+                               suggested_team_size, suggested_vector_size),
+                           sc);
+    } else {
+      Kokkos::parallel_for(
+          "KokkosSparse::NumericCMEM_CPU::DENSE::STATIC",
+          multicore_team_policy_t(Base::a_row_cnt / team_row_chunk_size + 1,
+                                  suggested_team_size, suggested_vector_size),
+          sc);
+    }
+
+    MyExecSpace().fence();
+
+    if (Base::KOKKOSKERNELS_VERBOSE) {
+      std::cout << "\t\tNumeric TIME:" << timer1.seconds() << std::endl;
+      std::cout << "\t\tNumeric SPEED TIME:" << numeric_speed_timer.seconds()
+                << std::endl;
+    }
+  }
+  if (Base::KOKKOSKERNELS_VERBOSE) {
+    std::cout << "\t\tNumeric SPEED TIME WITH FREE:"
+              << numeric_speed_timer_with_free.seconds() << std::endl;
+  }
+}
+}  // namespace Impl
+}  // namespace KokkosSparse
diff --git a/src/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp b/src/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp
new file mode 100644
index 0000000000..d87c49bd55
--- /dev/null
+++ b/src/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp
@@ -0,0 +1,407 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+#ifndef KOKKOSSPARSE_IMPL_BSPGEMM_NUMERIC_SPEC_HPP_
+#define KOKKOSSPARSE_IMPL_BSPGEMM_NUMERIC_SPEC_HPP_
+
+#include <KokkosKernels_config.h>
+
+#include <Kokkos_Core.hpp>
+//#include <Kokkos_ArithTraits.hpp>
+#include "KokkosKernels_Handle.hpp"
+// Include the actual functors
+#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY
+//#include "KokkosSparse_spgemm_symbolic.hpp"
+#include "KokkosSparse_spgemm_cuSPARSE_impl.hpp"
+#include "KokkosSparse_spgemm_CUSP_impl.hpp"
+#include "KokkosSparse_bspgemm_impl.hpp"
+#include "KokkosSparse_bspgemm_impl_seq.hpp"
+#include "KokkosSparse_spgemm_mkl_impl.hpp"
+#include "KokkosSparse_spgemm_mkl2phase_impl.hpp"
+#include "KokkosSparse_spgemm_viennaCL_impl.hpp"
+#endif
+
+namespace KokkosSparse {
+namespace Impl {
+// Specialization struct which defines whether a specialization exists
+template <class KernelHandle, class a_size_view_t_, class a_lno_view_t,
+          class a_scalar_view_t, class b_size_view_t_, class b_lno_view_t,
+          class b_scalar_view_t, class c_size_view_t_, class c_lno_view_t,
+          class c_scalar_view_t>
+struct bspgemm_numeric_eti_spec_avail {
+  enum : bool { value = false };
+};
+
+}  // namespace Impl
+}  // namespace KokkosSparse
+
+#define KOKKOSSPARSE_BSPGEMM_NUMERIC_ETI_SPEC_AVAIL(                      \
+    SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \
+    FAST_MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE)                             \
+  template <>                                                             \
+  struct bspgemm_numeric_eti_spec_avail<                                  \
+      KokkosKernels::Experimental::KokkosKernelsHandle<                   \
+          const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE,       \
+          EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE>,     \
+      Kokkos::View<const OFFSET_TYPE *, LAYOUT_TYPE,                      \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const ORDINAL_TYPE *, LAYOUT_TYPE,                     \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const SCALAR_TYPE *, LAYOUT_TYPE,                      \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const OFFSET_TYPE *, LAYOUT_TYPE,                      \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const ORDINAL_TYPE *, LAYOUT_TYPE,                     \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const SCALAR_TYPE *, LAYOUT_TYPE,                      \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const OFFSET_TYPE *, LAYOUT_TYPE,                      \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<ORDINAL_TYPE *, LAYOUT_TYPE,                           \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<SCALAR_TYPE *, LAYOUT_TYPE,                            \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> > > {          \
+    enum : bool { value = true };                                         \
+  };                                                                      \
+                                                                          \
+  template <>                                                             \
+  struct bspgemm_numeric_eti_spec_avail<                                  \
+      KokkosKernels::Experimental::KokkosKernelsHandle<                   \
+          const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE,       \
+          EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE>,     \
+      Kokkos::View<const OFFSET_TYPE *, LAYOUT_TYPE,                      \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const ORDINAL_TYPE *, LAYOUT_TYPE,                     \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const SCALAR_TYPE *, LAYOUT_TYPE,                      \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const OFFSET_TYPE *, LAYOUT_TYPE,                      \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const ORDINAL_TYPE *, LAYOUT_TYPE,                     \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const SCALAR_TYPE *, LAYOUT_TYPE,                      \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<OFFSET_TYPE *, LAYOUT_TYPE,                            \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<ORDINAL_TYPE *, LAYOUT_TYPE,                           \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<SCALAR_TYPE *, LAYOUT_TYPE,                            \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> > > {          \
+    enum : bool { value = true };                                         \
+  };
+
+// Include the actual specialization declarations
+//#include <KokkosSparse_bspgemm_tpl_spec_avail.hpp>
+#include <generated_specializations_hpp/KokkosSparse_bspgemm_numeric_eti_spec_avail.hpp>
+
+namespace KokkosSparse {
+namespace Impl {
+
+// For future use (when TPL with block SpGEMM numeric phase is encountered)
+template <class KernelHandle, class a_size_view_t_, class a_lno_view_t,
+          class a_scalar_view_t, class b_size_view_t_, class b_lno_view_t,
+          class b_scalar_view_t, class c_size_view_t_, class c_lno_view_t,
+          class c_scalar_view_t>
+struct bspgemm_numeric_tpl_spec_avail {
+  enum : bool { value = false };
+};
+
+// Unification layer
+/// \brief Implementation of BSR sparse block matrix - matrix multiplication
+
+template <class KernelHandle, class a_size_view_t_, class a_lno_view_t,
+          class a_scalar_view_t, class b_size_view_t_, class b_lno_view_t,
+          class b_scalar_view_t, class c_size_view_t_, class c_lno_view_t,
+          class c_scalar_view_t,
+          bool tpl_spec_avail = bspgemm_numeric_tpl_spec_avail<
+              KernelHandle, a_size_view_t_, a_lno_view_t, a_scalar_view_t,
+              b_size_view_t_, b_lno_view_t, b_scalar_view_t, c_size_view_t_,
+              c_lno_view_t, c_scalar_view_t>::value,
+          bool eti_spec_avail = bspgemm_numeric_eti_spec_avail<
+              KernelHandle, a_size_view_t_, a_lno_view_t, a_scalar_view_t,
+              b_size_view_t_, b_lno_view_t, b_scalar_view_t, c_size_view_t_,
+              c_lno_view_t, c_scalar_view_t>::value>
+struct BSPGEMM_NUMERIC {
+  static void bspgemm_numeric(KernelHandle *handle,
+                              typename KernelHandle::const_nnz_lno_t m,
+                              typename KernelHandle::const_nnz_lno_t n,
+                              typename KernelHandle::const_nnz_lno_t k,
+                              typename KernelHandle::const_nnz_lno_t blockDim,
+                              a_size_view_t_ row_mapA, a_lno_view_t entriesA,
+                              a_scalar_view_t valuesA,
+
+                              bool transposeA, b_size_view_t_ row_mapB,
+                              b_lno_view_t entriesB, b_scalar_view_t valuesB,
+                              bool transposeB, c_size_view_t_ row_mapC,
+                              c_lno_view_t &entriesC, c_scalar_view_t &valuesC);
+};
+
+#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY
+
+//! Full specialization of block spgemm
+// Unification layer
+template <class KernelHandle, class a_size_view_t_, class a_lno_view_t,
+          class a_scalar_view_t, class b_size_view_t_, class b_lno_view_t,
+          class b_scalar_view_t, class c_size_view_t_, class c_lno_view_t,
+          class c_scalar_view_t>
+struct BSPGEMM_NUMERIC<
+    KernelHandle, a_size_view_t_, a_lno_view_t, a_scalar_view_t, b_size_view_t_,
+    b_lno_view_t, b_scalar_view_t, c_size_view_t_, c_lno_view_t,
+    c_scalar_view_t, false, KOKKOSKERNELS_IMPL_COMPILE_LIBRARY> {
+  static void bspgemm_numeric(
+      KernelHandle *handle, typename KernelHandle::nnz_lno_t m,
+      typename KernelHandle::nnz_lno_t n, typename KernelHandle::nnz_lno_t k,
+      typename KernelHandle::const_nnz_lno_t blockDim, a_size_view_t_ row_mapA,
+      a_lno_view_t entriesA, a_scalar_view_t valuesA,
+
+      bool transposeA, b_size_view_t_ row_mapB, b_lno_view_t entriesB,
+      b_scalar_view_t valuesB, bool transposeB, c_size_view_t_ row_mapC,
+      c_lno_view_t &entriesC, c_scalar_view_t &valuesC) {
+    typedef typename KernelHandle::SPGEMMHandleType spgemmHandleType;
+    spgemmHandleType *sh = handle->get_spgemm_handle();
+    if (!sh->is_symbolic_called()) {
+      throw std::runtime_error(
+          "Call spgemm symbolic before calling SpGEMM numeric");
+    }
+
+    switch (sh->get_algorithm_type()) {
+      case SPGEMM_CUSPARSE:
+        throw std::runtime_error(
+            "cuSPARSE implementation for block SpGEMM is not available");
+      case SPGEMM_CUSP:
+        throw std::runtime_error(
+            "CUSP implementation for block SpGEMM is not available");
+      case SPGEMM_MKL:
+      case SPGEMM_MKL2PHASE:
+        throw std::runtime_error(
+            "MKL implementation available for block SpGEMM is not available");
+      case SPGEMM_VIENNA:
+        throw std::runtime_error(
+            "Vienna implementation available for block SpGEMM is not "
+            "available");
+
+      default:
+
+      {
+        KokkosBSPGEMM<KernelHandle, a_size_view_t_, a_lno_view_t,
+                      a_scalar_view_t, b_size_view_t_, b_lno_view_t,
+                      b_scalar_view_t>
+            kbspgemm(handle, m, n, k, blockDim, row_mapA, entriesA, valuesA,
+                     transposeA, row_mapB, entriesB, valuesB, transposeB);
+        kbspgemm.KokkosBSPGEMM_numeric(row_mapC, entriesC, valuesC);
+      } break;
+      case SPGEMM_SERIAL:
+      case SPGEMM_DEBUG:
+        bspgemm_debug_numeric(handle, m, n, k, blockDim, row_mapA, entriesA,
+                              valuesA, transposeA, row_mapB, entriesB, valuesB,
+                              transposeB, row_mapC, entriesC, valuesC);
+        break;
+    }
+  }
+};
+
+#endif
+
+}  // namespace Impl
+}  // namespace KokkosSparse
+
+#define KOKKOSSPARSE_BSPGEMM_NUMERIC_ETI_SPEC_DECL(                       \
+    SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \
+    FAST_MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE)                             \
+  extern template struct BSPGEMM_NUMERIC<                                 \
+      typename KokkosKernels::Experimental::KokkosKernelsHandle<          \
+          const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE,       \
+          EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE>,     \
+      Kokkos::View<const OFFSET_TYPE *, LAYOUT_TYPE,                      \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const ORDINAL_TYPE *, LAYOUT_TYPE,                     \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const SCALAR_TYPE *, LAYOUT_TYPE,                      \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const OFFSET_TYPE *, LAYOUT_TYPE,                      \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const ORDINAL_TYPE *, LAYOUT_TYPE,                     \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const SCALAR_TYPE *, LAYOUT_TYPE,                      \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const OFFSET_TYPE *, LAYOUT_TYPE,                      \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<ORDINAL_TYPE *, LAYOUT_TYPE,                           \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<SCALAR_TYPE *, LAYOUT_TYPE,                            \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      false, true>;                                                       \
+                                                                          \
+  extern template struct BSPGEMM_NUMERIC<                                 \
+      typename KokkosKernels::Experimental::KokkosKernelsHandle<          \
+          const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE,       \
+          EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE>,     \
+      Kokkos::View<const OFFSET_TYPE *, LAYOUT_TYPE,                      \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const ORDINAL_TYPE *, LAYOUT_TYPE,                     \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const SCALAR_TYPE *, LAYOUT_TYPE,                      \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const OFFSET_TYPE *, LAYOUT_TYPE,                      \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const ORDINAL_TYPE *, LAYOUT_TYPE,                     \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const SCALAR_TYPE *, LAYOUT_TYPE,                      \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<OFFSET_TYPE *, LAYOUT_TYPE,                            \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<ORDINAL_TYPE *, LAYOUT_TYPE,                           \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<SCALAR_TYPE *, LAYOUT_TYPE,                            \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      false, true>;
+
+#define KOKKOSSPARSE_BSPGEMM_NUMERIC_ETI_SPEC_INST(                       \
+    SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \
+    FAST_MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE)                             \
+  template struct BSPGEMM_NUMERIC<                                        \
+      KokkosKernels::Experimental::KokkosKernelsHandle<                   \
+          const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE,       \
+          EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE>,     \
+      Kokkos::View<const OFFSET_TYPE *, LAYOUT_TYPE,                      \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const ORDINAL_TYPE *, LAYOUT_TYPE,                     \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const SCALAR_TYPE *, LAYOUT_TYPE,                      \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const OFFSET_TYPE *, LAYOUT_TYPE,                      \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const ORDINAL_TYPE *, LAYOUT_TYPE,                     \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const SCALAR_TYPE *, LAYOUT_TYPE,                      \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const OFFSET_TYPE *, LAYOUT_TYPE,                      \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<ORDINAL_TYPE *, LAYOUT_TYPE,                           \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<SCALAR_TYPE *, LAYOUT_TYPE,                            \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      false, true>;                                                       \
+                                                                          \
+  template struct BSPGEMM_NUMERIC<                                        \
+      KokkosKernels::Experimental::KokkosKernelsHandle<                   \
+          const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE,       \
+          EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE>,     \
+      Kokkos::View<const OFFSET_TYPE *, LAYOUT_TYPE,                      \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const ORDINAL_TYPE *, LAYOUT_TYPE,                     \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const SCALAR_TYPE *, LAYOUT_TYPE,                      \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const OFFSET_TYPE *, LAYOUT_TYPE,                      \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const ORDINAL_TYPE *, LAYOUT_TYPE,                     \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const SCALAR_TYPE *, LAYOUT_TYPE,                      \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<OFFSET_TYPE *, LAYOUT_TYPE,                            \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<ORDINAL_TYPE *, LAYOUT_TYPE,                           \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<SCALAR_TYPE *, LAYOUT_TYPE,                            \
+                   Kokkos::Device<EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE>,  \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      false, true>;
+
+//#include <KokkosSparse_spgemm_tpl_spec_decl.hpp>
+#include <generated_specializations_hpp/KokkosSparse_bspgemm_numeric_eti_spec_decl.hpp>
+
+#endif  // KOKKOS_BLAS1_MV_IMPL_DOT_HPP_
diff --git a/src/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp b/src/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp
index 60a00bd36a..bb95eea101 100644
--- a/src/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp
+++ b/src/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp
@@ -337,9 +337,13 @@ class ClusterGaussSeidel {
                            (teamMember.league_rank() * _clusters_per_team) +
                            work;
             if (ii >= _color_set_end) return;
-            nnz_lno_t cluster = _color_adj(ii);
-            for (nnz_lno_t j = _cluster_offsets(cluster);
-                 j < _cluster_offsets(cluster + 1); j++) {
+            nnz_lno_t cluster      = _color_adj(ii);
+            nnz_lno_t clusterBegin = _cluster_offsets(cluster);
+            nnz_lno_t clusterEnd   = _cluster_offsets(cluster + 1);
+            for (nnz_lno_t jcount = 0; jcount < clusterEnd - clusterBegin;
+                 jcount++) {
+              nnz_lno_t j = _is_backward ? (clusterEnd - 1 - jcount)
+                                         : clusterBegin + jcount;
               nnz_lno_t row      = _cluster_verts(j);
               nnz_lno_t num_vecs = _Xvector.extent(1);
               for (nnz_lno_t batch_start = 0; batch_start < num_vecs;) {
@@ -352,14 +356,10 @@ class ClusterGaussSeidel {
                   COL_BATCH_CASE(1)
                   COL_BATCH_CASE(2)
                   COL_BATCH_CASE(3)
-                  COL_BATCH_CASE(4)
-                  COL_BATCH_CASE(5)
-                  COL_BATCH_CASE(6)
-                  COL_BATCH_CASE(7)
 #undef COL_BATCH_CASE
                   default:
-                    runColBatch<8>(teamMember, row, batch_start);
-                    batch_start += 8;
+                    runColBatch<4>(teamMember, row, batch_start);
+                    batch_start += 4;
                 }
               }
             }
@@ -561,6 +561,7 @@ class ClusterGaussSeidel {
           in_rowmap_t, in_colinds_t, rowmap_t, colinds_t, MyExecSpace>(
           num_rows, this->row_map, this->entries, sym_xadj, sym_adj);
 #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE
+      MyExecSpace().fence();
       std::cout << "SYMMETRIZING TIME: " << timer.seconds() << std::endl;
       timer.reset();
 #endif
@@ -607,6 +608,7 @@ class ClusterGaussSeidel {
                                  " is not implemented");
     }
 #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE
+    MyExecSpace().fence();
     std::cout << "Graph clustering: " << timer.seconds() << '\n';
     timer.reset();
 #endif
@@ -620,6 +622,7 @@ class ClusterGaussSeidel {
         raw_sym_xadj, raw_sym_adj, vertClusters, numClusters, clusterRowmap,
         clusterEntries, clusterOffsets, clusterVerts, false);
 #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE
+    MyExecSpace().fence();
     std::cout << "Building explicit cluster graph: " << timer.seconds() << '\n';
     timer.reset();
 #endif
@@ -668,6 +671,7 @@ class ClusterGaussSeidel {
     kh.destroy_graph_coloring_handle();
 #endif
 #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE
+    MyExecSpace().fence();
     std::cout << "Coloring: " << timer.seconds() << '\n';
     timer.reset();
 #endif
@@ -677,8 +681,8 @@ class ClusterGaussSeidel {
         typename HandleType::GraphColoringHandleType::color_view_t,
         nnz_lno_persistent_work_view_t, MyExecSpace>(
         numClusters, numColors, colors, color_xadj, color_adj);
-    MyExecSpace().fence();
 #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE
+    MyExecSpace().fence();
     std::cout << "CREATE_REVERSE_MAP:" << timer.seconds() << std::endl;
     timer.reset();
 #endif
@@ -798,8 +802,8 @@ class ClusterGaussSeidel {
     }
     gsHandle->set_inverse_diagonal(inverse_diagonal);
     gsHandle->set_call_numeric(true);
-    MyExecSpace().fence();
 #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE
+    MyExecSpace().fence();
     std::cout << "NUMERIC:" << timer.seconds() << std::endl;
 #endif
   }
@@ -861,7 +865,6 @@ class ClusterGaussSeidel {
       this->IterativePSGS(gs, numColors, h_color_xadj, numIter, apply_forward,
                           apply_backward);
     }
-    MyExecSpace().fence();
   }
 
   template <typename TPSGS>
@@ -894,7 +897,6 @@ class ClusterGaussSeidel {
                               gs._clusters_per_team,
                           team_size, vec_size),
             gs);
-        MyExecSpace().fence();
       }
     }
     if (apply_backward) {
@@ -913,7 +915,6 @@ class ClusterGaussSeidel {
                                 gs._clusters_per_team,
                             team_size, vec_size),
               gs);
-          MyExecSpace().fence();
           if (i == 0) {
             break;
           }
@@ -945,7 +946,6 @@ class ClusterGaussSeidel {
                              Kokkos::RangePolicy<MyExecSpace, PSGS_ForwardTag>(
                                  0, color_index_end - color_index_begin),
                              gs);
-        MyExecSpace().fence();
       }
     }
     if (apply_backward && numColors) {
@@ -958,7 +958,6 @@ class ClusterGaussSeidel {
                              Kokkos::RangePolicy<MyExecSpace, PSGS_BackwardTag>(
                                  0, color_index_end - color_index_begin),
                              gs);
-        MyExecSpace().fence();
         if (i == 0) {
           break;
         }
diff --git a/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp b/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp
index 0f265dfbc4..137b75b3f7 100644
--- a/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp
+++ b/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp
@@ -46,13 +46,14 @@
 #define _KOKKOSGSIMP_HPP
 
 #include "KokkosKernels_Utils.hpp"
+#include "KokkosSparse_Utils.hpp"
 #include <Kokkos_Core.hpp>
 #include <Kokkos_Bitset.hpp>
 #include "KokkosGraph_Distance1Color.hpp"
 #include "KokkosKernels_Uniform_Initialized_MemoryPool.hpp"
 #include "KokkosKernels_BitUtils.hpp"
 #include "KokkosKernels_SimpleUtils.hpp"
-#include "KokkosKernels_Sorting.hpp"
+#include "KokkosSparse_SortCrs.hpp"
 
 // FOR DEBUGGING
 #include "KokkosBlas1_nrm2.hpp"
@@ -62,7 +63,7 @@ namespace Impl {
 
 template <typename HandleType, typename lno_row_view_t_,
           typename lno_nnz_view_t_, typename scalar_nnz_view_t_,
-          KokkosKernels::SparseMatrixFormat format = KokkosKernels::CRS>
+          KokkosSparse::SparseMatrixFormat format = KokkosSparse::CRS>
 class PointGaussSeidel {
  public:
   typedef lno_row_view_t_ in_lno_row_view_t;
@@ -136,7 +137,7 @@ class PointGaussSeidel {
       pool_memory_space;
 
   typedef
-      typename KokkosKernels::Impl::MatrixRowIndex<format, nnz_lno_t, size_type>
+      typename KokkosSparse::Impl::MatrixRowIndex<format, nnz_lno_t, size_type>
           RowIndex;
 
  private:
@@ -979,8 +980,8 @@ class PointGaussSeidel {
       gsHandle->set_long_row_x(long_row_x);
     } else {
       // Just sort rows by ID.
-      KokkosKernels::sort_crs_graph<MyExecSpace, decltype(color_xadj),
-                                    decltype(color_adj)>(color_xadj, color_adj);
+      KokkosSparse::sort_crs_graph<MyExecSpace, decltype(color_xadj),
+                                   decltype(color_adj)>(color_xadj, color_adj);
     }
 #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE
     MyExecSpace().fence();
@@ -1104,7 +1105,7 @@ class PointGaussSeidel {
           // std::cout << "level_2_mem:" << level_2_mem << std::endl;
 
           size_type num_large_rows = 0;
-          KokkosKernels::Impl::kk_reduce_numrows_larger_than_threshold<
+          KokkosSparse::Impl::kk_reduce_numrows_larger_than_threshold<
               row_lno_persistent_work_view_t, MyExecSpace>(
               brows, permuted_xadj, num_values_in_l1, num_large_rows);
           num_big_rows = KOKKOSKERNELS_MACRO_MIN(
diff --git a/src/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp b/src/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp
index 182d33a2e7..5af78f96c5 100644
--- a/src/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp
+++ b/src/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp
@@ -161,7 +161,7 @@ struct GAUSS_SEIDEL_SYMBOLIC {
 };
 
 template <
-    class KernelHandle, KokkosKernels::SparseMatrixFormat format,
+    class KernelHandle, KokkosSparse::SparseMatrixFormat format,
     class a_size_view_t_, class a_lno_view_t, class a_scalar_view_t,
     bool tpl_spec_avail = gauss_seidel_numeric_tpl_spec_avail<
         KernelHandle, a_size_view_t_, a_lno_view_t, a_scalar_view_t>::value,
@@ -180,7 +180,7 @@ struct GAUSS_SEIDEL_NUMERIC {
       a_scalar_view_t given_inverse_diagonal, bool is_graph_symmetric);
 };
 
-template <class KernelHandle, KokkosKernels::SparseMatrixFormat format,
+template <class KernelHandle, KokkosSparse::SparseMatrixFormat format,
           class a_size_view_t_, class a_lno_view_t, class a_scalar_view_t,
           class x_scalar_view_t, class y_scalar_view_t,
           bool tpl_spec_avail = gauss_seidel_apply_tpl_spec_avail<
@@ -234,7 +234,7 @@ struct GAUSS_SEIDEL_SYMBOLIC<KernelHandle, a_size_view_t_, a_lno_view_t_, false,
   }
 };
 
-template <class KernelHandle, KokkosKernels::SparseMatrixFormat format,
+template <class KernelHandle, KokkosSparse::SparseMatrixFormat format,
           class a_size_view_t_, class a_lno_view_t, class a_scalar_view_t>
 struct GAUSS_SEIDEL_NUMERIC<KernelHandle, format, a_size_view_t_, a_lno_view_t,
                             a_scalar_view_t, false,
@@ -301,7 +301,7 @@ struct GAUSS_SEIDEL_NUMERIC<KernelHandle, format, a_size_view_t_, a_lno_view_t,
   }
 };
 
-template <class KernelHandle, KokkosKernels::SparseMatrixFormat format,
+template <class KernelHandle, KokkosSparse::SparseMatrixFormat format,
           class a_size_view_t_, class a_lno_view_t, class a_scalar_view_t,
           class x_scalar_view_t, class y_scalar_view_t>
 struct GAUSS_SEIDEL_APPLY<KernelHandle, format, a_size_view_t_, a_lno_view_t,
@@ -401,7 +401,7 @@ struct GAUSS_SEIDEL_APPLY<KernelHandle, format, a_size_view_t_, a_lno_view_t,
       KokkosKernels::Experimental::KokkosKernelsHandle<                   \
           const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE,       \
           EXEC_SPACE_TYPE, MEM_SPACE_TYPE, SLOW_MEM_SPACE>,               \
-      KokkosKernels::BlockCRS,                                            \
+      KokkosSparse::BlockCRS,                                             \
       Kokkos::View<const OFFSET_TYPE *, LAYOUT_TYPE,                      \
                    Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
                    Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
@@ -416,7 +416,7 @@ struct GAUSS_SEIDEL_APPLY<KernelHandle, format, a_size_view_t_, a_lno_view_t,
       KokkosKernels::Experimental::KokkosKernelsHandle<                   \
           const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE,       \
           EXEC_SPACE_TYPE, MEM_SPACE_TYPE, SLOW_MEM_SPACE>,               \
-      KokkosKernels::BSR,                                                 \
+      KokkosSparse::BSR,                                                  \
       Kokkos::View<const OFFSET_TYPE *, LAYOUT_TYPE,                      \
                    Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
                    Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
@@ -435,7 +435,7 @@ struct GAUSS_SEIDEL_APPLY<KernelHandle, format, a_size_view_t_, a_lno_view_t,
       KokkosKernels::Experimental::KokkosKernelsHandle<                   \
           const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE,       \
           EXEC_SPACE_TYPE, MEM_SPACE_TYPE, SLOW_MEM_SPACE>,               \
-      KokkosKernels::BlockCRS,                                            \
+      KokkosSparse::BlockCRS,                                             \
       Kokkos::View<const OFFSET_TYPE *, LAYOUT_TYPE,                      \
                    Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
                    Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
@@ -456,7 +456,7 @@ struct GAUSS_SEIDEL_APPLY<KernelHandle, format, a_size_view_t_, a_lno_view_t,
       KokkosKernels::Experimental::KokkosKernelsHandle<                   \
           const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE,       \
           EXEC_SPACE_TYPE, MEM_SPACE_TYPE, SLOW_MEM_SPACE>,               \
-      KokkosKernels::BSR,                                                 \
+      KokkosSparse::BSR,                                                  \
       Kokkos::View<const OFFSET_TYPE *, LAYOUT_TYPE,                      \
                    Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
                    Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
@@ -481,7 +481,7 @@ struct GAUSS_SEIDEL_APPLY<KernelHandle, format, a_size_view_t_, a_lno_view_t,
       KokkosKernels::Experimental::KokkosKernelsHandle<                   \
           const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE,       \
           EXEC_SPACE_TYPE, MEM_SPACE_TYPE, SLOW_MEM_SPACE>,               \
-      KokkosKernels::BlockCRS,                                            \
+      KokkosSparse::BlockCRS,                                             \
       Kokkos::View<const OFFSET_TYPE *, LAYOUT_TYPE,                      \
                    Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
                    Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
@@ -502,7 +502,7 @@ struct GAUSS_SEIDEL_APPLY<KernelHandle, format, a_size_view_t_, a_lno_view_t,
       KokkosKernels::Experimental::KokkosKernelsHandle<                   \
           const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE,       \
           EXEC_SPACE_TYPE, MEM_SPACE_TYPE, SLOW_MEM_SPACE>,               \
-      KokkosKernels::BSR,                                                 \
+      KokkosSparse::BSR,                                                  \
       Kokkos::View<const OFFSET_TYPE *, LAYOUT_TYPE,                      \
                    Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
                    Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
diff --git a/src/sparse/impl/KokkosSparse_spadd_numeric_impl.hpp b/src/sparse/impl/KokkosSparse_spadd_numeric_impl.hpp
new file mode 100644
index 0000000000..b3008ff716
--- /dev/null
+++ b/src/sparse/impl/KokkosSparse_spadd_numeric_impl.hpp
@@ -0,0 +1,306 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Brian Kelley (bmkelle@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef _KOKKOS_SPADD_NUMERIC_IMPL_HPP
+#define _KOKKOS_SPADD_NUMERIC_IMPL_HPP
+
+#include "KokkosKernels_Handle.hpp"
+#include "KokkosKernels_Sorting.hpp"
+#include "Kokkos_ArithTraits.hpp"
+
+namespace KokkosSparse {
+namespace Impl {
+
+template <typename size_type, typename ordinal_type, typename ArowptrsT,
+          typename BrowptrsT, typename CrowptrsT, typename AcolindsT,
+          typename BcolindsT, typename CcolindsT, typename AvaluesT,
+          typename BvaluesT, typename CvaluesT, typename AscalarT,
+          typename BscalarT>
+struct SortedNumericSumFunctor {
+  using CscalarT = typename CvaluesT::non_const_value_type;
+
+  SortedNumericSumFunctor(const ArowptrsT& Arowptrs_,
+                          const BrowptrsT& Browptrs_,
+                          const CrowptrsT& Crowptrs_,
+                          const AcolindsT& Acolinds_,
+                          const BcolindsT& Bcolinds_,
+                          const CcolindsT& Ccolinds_, const AvaluesT& Avalues_,
+                          const BvaluesT& Bvalues_, const CvaluesT& Cvalues_,
+                          const AscalarT alpha_, const BscalarT beta_)
+      : Arowptrs(Arowptrs_),
+        Browptrs(Browptrs_),
+        Crowptrs(Crowptrs_),
+        Acolinds(Acolinds_),
+        Bcolinds(Bcolinds_),
+        Ccolinds(Ccolinds_),
+        Avalues(Avalues_),
+        Bvalues(Bvalues_),
+        Cvalues(Cvalues_),
+        alpha(alpha_),
+        beta(beta_) {}
+
+  KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const {
+    const ordinal_type ORDINAL_MAX = Kokkos::ArithTraits<ordinal_type>::max();
+
+    // count the union of nonzeros in Arow and Brow
+    size_type ai        = 0;
+    size_type bi        = 0;
+    size_type Arowstart = Arowptrs(i);
+    size_type Arowlen   = Arowptrs(i + 1) - Arowstart;
+    size_type Browstart = Browptrs(i);
+    size_type Browlen   = Browptrs(i + 1) - Browstart;
+    ordinal_type Acol   = (Arowlen == 0) ? ORDINAL_MAX : Acolinds(Arowstart);
+    ordinal_type Bcol   = (Browlen == 0) ? ORDINAL_MAX : Bcolinds(Browstart);
+    size_type Coffset   = Crowptrs(i);
+    while (Acol != ORDINAL_MAX || Bcol != ORDINAL_MAX) {
+      ordinal_type Ccol = (Acol < Bcol) ? Acol : Bcol;
+      // Eat all entries in both A and B which have this column
+      // This also results in Acol/Bcol being updated to following entries for
+      // next loop iter
+      CscalarT accum = Kokkos::ArithTraits<CscalarT>::zero();
+      while (Acol == Ccol) {
+        accum += static_cast<CscalarT>(alpha * Avalues(Arowstart + ai));
+        ai++;
+        if (ai == Arowlen)
+          Acol = ORDINAL_MAX;
+        else
+          Acol = Acolinds(Arowstart + ai);
+      }
+      while (Bcol == Ccol) {
+        accum += static_cast<CscalarT>(beta * Bvalues(Browstart + bi));
+        bi++;
+        if (bi == Browlen)
+          Bcol = ORDINAL_MAX;
+        else
+          Bcol = Bcolinds(Browstart + bi);
+      }
+      Ccolinds(Coffset) = Ccol;
+      Cvalues(Coffset)  = accum;
+      Coffset++;
+    }
+  }
+
+  const ArowptrsT Arowptrs;
+  const BrowptrsT Browptrs;
+  const CrowptrsT Crowptrs;
+  const AcolindsT Acolinds;
+  const BcolindsT Bcolinds;
+  CcolindsT Ccolinds;
+  const AvaluesT Avalues;
+  const BvaluesT Bvalues;
+  CvaluesT Cvalues;
+  const AscalarT alpha;
+  const BscalarT beta;
+};
+
+template <typename size_type, typename ordinal_type, typename ArowptrsT,
+          typename BrowptrsT, typename CrowptrsT, typename AcolindsT,
+          typename BcolindsT, typename CcolindsT, typename AvaluesT,
+          typename BvaluesT, typename CvaluesT, typename AscalarT,
+          typename BscalarT>
+struct UnsortedNumericSumFunctor {
+  using CscalarT = typename CvaluesT::non_const_value_type;
+
+  UnsortedNumericSumFunctor(
+      const ArowptrsT Arowptrs_, const BrowptrsT Browptrs_,
+      const CrowptrsT Crowptrs_, const AcolindsT Acolinds_,
+      const BcolindsT Bcolinds_, CcolindsT Ccolinds_, const AvaluesT Avalues_,
+      const BvaluesT Bvalues_, CvaluesT Cvalues_, const AscalarT alpha_,
+      const BscalarT beta_, const CcolindsT Apos_, const CcolindsT Bpos_)
+      : Arowptrs(Arowptrs_),
+        Browptrs(Browptrs_),
+        Crowptrs(Crowptrs_),
+        Acolinds(Acolinds_),
+        Bcolinds(Bcolinds_),
+        Ccolinds(Ccolinds_),
+        Avalues(Avalues_),
+        Bvalues(Bvalues_),
+        Cvalues(Cvalues_),
+        alpha(alpha_),
+        beta(beta_),
+        Apos(Apos_),
+        Bpos(Bpos_) {}
+
+  KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const {
+    size_type CrowStart = Crowptrs(i);
+    size_type CrowEnd   = Crowptrs(i + 1);
+    size_type ArowStart = Arowptrs(i);
+    size_type ArowEnd   = Arowptrs(i + 1);
+    size_type BrowStart = Browptrs(i);
+    size_type BrowEnd   = Browptrs(i + 1);
+    for (size_type j = CrowStart; j < CrowEnd; j++)
+      Cvalues(j) = Kokkos::ArithTraits<CscalarT>::zero();
+    // add in A entries, while setting C colinds
+    for (size_type j = ArowStart; j < ArowEnd; j++) {
+      Cvalues(CrowStart + Apos(j)) += alpha * Avalues(j);
+      Ccolinds(CrowStart + Apos(j)) = Acolinds(j);
+    }
+    // add in B entries, while setting C colinds
+    for (size_type j = BrowStart; j < BrowEnd; j++) {
+      Cvalues(CrowStart + Bpos(j)) += beta * Bvalues(j);
+      Ccolinds(CrowStart + Bpos(j)) = Bcolinds(j);
+    }
+  }
+  const ArowptrsT Arowptrs;
+  const BrowptrsT Browptrs;
+  const CrowptrsT Crowptrs;
+  const AcolindsT Acolinds;
+  const BcolindsT Bcolinds;
+  CcolindsT Ccolinds;
+  const AvaluesT Avalues;
+  const BvaluesT Bvalues;
+  CvaluesT Cvalues;
+  const AscalarT alpha;
+  const BscalarT beta;
+  const CcolindsT Apos;
+  const CcolindsT Bpos;
+};
+
+// Helper macro to check that two types are the same (ignoring const)
+#define SAME_TYPE(A, B)                             \
+  std::is_same<typename std::remove_const<A>::type, \
+               typename std::remove_const<B>::type>::value
+
+template <typename KernelHandle, typename alno_row_view_t,
+          typename alno_nnz_view_t, typename ascalar_t,
+          typename ascalar_nnz_view_t, typename blno_row_view_t,
+          typename blno_nnz_view_t, typename bscalar_t,
+          typename bscalar_nnz_view_t, typename clno_row_view_t,
+          typename clno_nnz_view_t, typename cscalar_nnz_view_t>
+void spadd_numeric_impl(
+    KernelHandle* kernel_handle, const alno_row_view_t a_rowmap,
+    const alno_nnz_view_t a_entries, const ascalar_nnz_view_t a_values,
+    const ascalar_t alpha, const blno_row_view_t b_rowmap,
+    const blno_nnz_view_t b_entries, const bscalar_nnz_view_t b_values,
+    const bscalar_t beta, const clno_row_view_t c_rowmap,
+    clno_nnz_view_t c_entries, cscalar_nnz_view_t c_values) {
+  typedef typename KernelHandle::size_type size_type;
+  typedef typename KernelHandle::nnz_lno_t ordinal_type;
+  typedef typename KernelHandle::nnz_scalar_t scalar_type;
+  typedef
+      typename KernelHandle::SPADDHandleType::execution_space execution_space;
+  // Check that A/B/C data types match KernelHandle types, and that C data types
+  // are nonconst (doesn't matter if A/B types are const)
+  static_assert(SAME_TYPE(ascalar_t, scalar_type),
+                "A scalar type must match handle scalar type");
+  static_assert(SAME_TYPE(bscalar_t, scalar_type),
+                "B scalar type must match handle scalar type");
+  static_assert(SAME_TYPE(typename alno_row_view_t::value_type, size_type),
+                "add_symbolic: A size_type must match KernelHandle size_type "
+                "(const doesn't matter)");
+  static_assert(SAME_TYPE(typename blno_row_view_t::value_type, size_type),
+                "add_symbolic: B size_type must match KernelHandle size_type "
+                "(const doesn't matter)");
+  static_assert(
+      SAME_TYPE(typename clno_row_view_t::non_const_value_type, size_type),
+      "add_symbolic: C size_type must match KernelHandle size_type)");
+  static_assert(SAME_TYPE(typename alno_nnz_view_t::value_type, ordinal_type),
+                "add_symbolic: A entry type must match KernelHandle entry type "
+                "(aka nnz_lno_t, and const doesn't matter)");
+  static_assert(SAME_TYPE(typename blno_nnz_view_t::value_type, ordinal_type),
+                "add_symbolic: B entry type must match KernelHandle entry type "
+                "(aka nnz_lno_t, and const doesn't matter)");
+  static_assert(SAME_TYPE(typename clno_nnz_view_t::value_type, ordinal_type),
+                "add_symbolic: C entry type must match KernelHandle entry type "
+                "(aka nnz_lno_t)");
+  static_assert(std::is_same<typename clno_nnz_view_t::non_const_value_type,
+                             typename clno_nnz_view_t::value_type>::value,
+                "add_symbolic: C entry type must not be const");
+  static_assert(
+      SAME_TYPE(typename ascalar_nnz_view_t::value_type, scalar_type),
+      "add_symbolic: A scalar type must match KernelHandle entry type (aka "
+      "nnz_lno_t, and const doesn't matter)");
+  static_assert(
+      SAME_TYPE(typename bscalar_nnz_view_t::value_type, scalar_type),
+      "add_symbolic: B scalar type must match KernelHandle entry type (aka "
+      "nnz_lno_t, and const doesn't matter)");
+  static_assert(
+      SAME_TYPE(typename cscalar_nnz_view_t::value_type, scalar_type),
+      "add_symbolic: C scalar type must match KernelHandle entry type (aka "
+      "nnz_lno_t)");
+  static_assert(std::is_same<typename cscalar_nnz_view_t::non_const_value_type,
+                             typename cscalar_nnz_view_t::value_type>::value,
+                "add_symbolic: C scalar type must not be const");
+  typedef Kokkos::RangePolicy<execution_space, size_type> range_type;
+  auto addHandle = kernel_handle->get_spadd_handle();
+  // rowmap length can be 0 or 1 if #rows is 0.
+  // Otherwise, it's always #rows+1.
+  if (a_rowmap.extent(0) == 0 || a_rowmap.extent(0) == 1) {
+    addHandle->set_call_numeric();
+    return;
+  }
+  ordinal_type nrows = a_rowmap.extent(0) - 1;
+  if (addHandle->is_input_sorted()) {
+    SortedNumericSumFunctor<size_type, ordinal_type, alno_row_view_t,
+                            blno_row_view_t, clno_row_view_t, alno_nnz_view_t,
+                            blno_nnz_view_t, clno_nnz_view_t,
+                            ascalar_nnz_view_t, bscalar_nnz_view_t,
+                            cscalar_nnz_view_t, ascalar_t, bscalar_t>
+        sortedNumeric(a_rowmap, b_rowmap, c_rowmap, a_entries, b_entries,
+                      c_entries, a_values, b_values, c_values, alpha, beta);
+    Kokkos::parallel_for("KokkosSparse::SpAdd:Numeric::InputSorted",
+                         range_type(0, nrows), sortedNumeric);
+  } else {
+    // use a_pos and b_pos (set in the handle by symbolic) to quickly compute C
+    // entries and values
+    UnsortedNumericSumFunctor<size_type, ordinal_type, alno_row_view_t,
+                              blno_row_view_t, clno_row_view_t, alno_nnz_view_t,
+                              blno_nnz_view_t, clno_nnz_view_t,
+                              ascalar_nnz_view_t, bscalar_nnz_view_t,
+                              cscalar_nnz_view_t, ascalar_t, bscalar_t>
+        unsortedNumeric(a_rowmap, b_rowmap, c_rowmap, a_entries, b_entries,
+                        c_entries, a_values, b_values, c_values, alpha, beta,
+                        addHandle->get_a_pos(), addHandle->get_b_pos());
+    Kokkos::parallel_for("KokkosSparse::SpAdd:Numeric::InputNotSorted",
+                         range_type(0, nrows), unsortedNumeric);
+  }
+  addHandle->set_call_numeric();
+}
+
+#undef SAME_TYPE
+
+}  // namespace Impl
+}  // namespace KokkosSparse
+
+#endif
diff --git a/src/sparse/impl/KokkosSparse_spadd_numeric_spec.hpp b/src/sparse/impl/KokkosSparse_spadd_numeric_spec.hpp
new file mode 100644
index 0000000000..7cc93e2715
--- /dev/null
+++ b/src/sparse/impl/KokkosSparse_spadd_numeric_spec.hpp
@@ -0,0 +1,244 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+#ifndef KOKKOSSPARSE_IMPL_SPADD_NUMERIC_SPEC_HPP_
+#define KOKKOSSPARSE_IMPL_SPADD_NUMERIC_SPEC_HPP_
+
+#include <KokkosKernels_config.h>
+
+#include <Kokkos_Core.hpp>
+#include "KokkosKernels_Handle.hpp"
+// Include the actual functors
+#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY
+#include "KokkosSparse_spadd_numeric_impl.hpp"
+#endif
+
+namespace KokkosSparse {
+namespace Impl {
+// Specialization struct which defines whether a specialization exists
+template <class KernelHandle, class a_size_view_t_, class a_lno_view_t,
+          class a_scalar_view_t, class b_size_view_t_, class b_lno_view_t,
+          class b_scalar_view_t, class c_size_view_t_, class c_lno_view_t,
+          class c_scalar_view_t>
+struct spadd_numeric_eti_spec_avail {
+  enum : bool { value = false };
+};
+
+}  // namespace Impl
+}  // namespace KokkosSparse
+
+#define KOKKOSSPARSE_SPADD_NUMERIC_ETI_SPEC_AVAIL(                        \
+    SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \
+    MEM_SPACE_TYPE)                                                       \
+  template <>                                                             \
+  struct spadd_numeric_eti_spec_avail<                                    \
+      KokkosKernels::Experimental::KokkosKernelsHandle<                   \
+          const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE,       \
+          EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>,               \
+      Kokkos::View<const OFFSET_TYPE *, LAYOUT_TYPE,                      \
+                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const ORDINAL_TYPE *, LAYOUT_TYPE,                     \
+                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const SCALAR_TYPE *, LAYOUT_TYPE,                      \
+                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const OFFSET_TYPE *, LAYOUT_TYPE,                      \
+                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const ORDINAL_TYPE *, LAYOUT_TYPE,                     \
+                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const SCALAR_TYPE *, LAYOUT_TYPE,                      \
+                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const OFFSET_TYPE *, LAYOUT_TYPE,                      \
+                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<ORDINAL_TYPE *, LAYOUT_TYPE,                           \
+                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<SCALAR_TYPE *, LAYOUT_TYPE,                            \
+                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> > > {          \
+    enum : bool { value = true };                                         \
+  };
+
+// Include the actual specialization declarations
+#include <KokkosSparse_spadd_tpl_spec_avail.hpp>
+#include <generated_specializations_hpp/KokkosSparse_spadd_numeric_eti_spec_avail.hpp>
+
+namespace KokkosSparse {
+namespace Impl {
+
+// Unification layer
+/// \brief Implementation of KokkosBlas::spadd (sparse-sparse matrix addition)
+
+template <class KernelHandle, class a_size_view_t, class a_lno_view_t,
+          class a_scalar_view_t, class b_size_view_t, class b_lno_view_t,
+          class b_scalar_view_t, class c_size_view_t, class c_lno_view_t,
+          class c_scalar_view_t,
+          bool tpl_spec_avail = spadd_numeric_tpl_spec_avail<
+              KernelHandle, a_size_view_t, a_lno_view_t, a_scalar_view_t,
+              b_size_view_t, b_lno_view_t, b_scalar_view_t, c_size_view_t,
+              c_lno_view_t, c_scalar_view_t>::value,
+          bool eti_spec_avail = spadd_numeric_eti_spec_avail<
+              KernelHandle, a_size_view_t, a_lno_view_t, a_scalar_view_t,
+              b_size_view_t, b_lno_view_t, b_scalar_view_t, c_size_view_t,
+              c_lno_view_t, c_scalar_view_t>::value>
+struct SPADD_NUMERIC {
+  static void spadd_numeric(KernelHandle *handle,
+                            typename a_scalar_view_t::const_value_type alpha,
+                            a_size_view_t row_mapA, a_lno_view_t entriesA,
+                            a_scalar_view_t valuesA,
+                            typename b_scalar_view_t::const_value_type beta,
+                            b_size_view_t row_mapB, b_lno_view_t entriesB,
+                            b_scalar_view_t valuesB, c_size_view_t row_mapC,
+                            c_lno_view_t entriesC, c_scalar_view_t valuesC);
+};
+
+#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY
+
+template <class KernelHandle, class a_size_view_t, class a_lno_view_t,
+          class a_scalar_view_t, class b_size_view_t, class b_lno_view_t,
+          class b_scalar_view_t, class c_size_view_t, class c_lno_view_t,
+          class c_scalar_view_t>
+struct SPADD_NUMERIC<KernelHandle, a_size_view_t, a_lno_view_t, a_scalar_view_t,
+                     b_size_view_t, b_lno_view_t, b_scalar_view_t,
+                     c_size_view_t, c_lno_view_t, c_scalar_view_t, false,
+                     KOKKOSKERNELS_IMPL_COMPILE_LIBRARY> {
+  static void spadd_numeric(KernelHandle *handle,
+                            typename a_scalar_view_t::const_value_type alpha,
+                            a_size_view_t row_mapA, a_lno_view_t entriesA,
+                            a_scalar_view_t valuesA,
+                            typename b_scalar_view_t::const_value_type beta,
+                            b_size_view_t row_mapB, b_lno_view_t entriesB,
+                            b_scalar_view_t valuesB, c_size_view_t row_mapC,
+                            c_lno_view_t entriesC, c_scalar_view_t valuesC) {
+    spadd_numeric_impl(handle, row_mapA, entriesA, valuesA, alpha, row_mapB,
+                       entriesB, valuesB, beta, row_mapC, entriesC, valuesC);
+  }
+};
+
+#endif
+
+}  // namespace Impl
+}  // namespace KokkosSparse
+
+#define KOKKOSSPARSE_SPADD_NUMERIC_ETI_SPEC_DECL(                         \
+    SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \
+    MEM_SPACE_TYPE)                                                       \
+  extern template struct SPADD_NUMERIC<                                   \
+      typename KokkosKernels::Experimental::KokkosKernelsHandle<          \
+          const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE,       \
+          EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>,               \
+      Kokkos::View<const OFFSET_TYPE *, LAYOUT_TYPE,                      \
+                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const ORDINAL_TYPE *, LAYOUT_TYPE,                     \
+                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const SCALAR_TYPE *, LAYOUT_TYPE,                      \
+                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const OFFSET_TYPE *, LAYOUT_TYPE,                      \
+                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const ORDINAL_TYPE *, LAYOUT_TYPE,                     \
+                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const SCALAR_TYPE *, LAYOUT_TYPE,                      \
+                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const OFFSET_TYPE *, LAYOUT_TYPE,                      \
+                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<ORDINAL_TYPE *, LAYOUT_TYPE,                           \
+                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<SCALAR_TYPE *, LAYOUT_TYPE,                            \
+                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      false, true>;
+
+#define KOKKOSSPARSE_SPADD_NUMERIC_ETI_SPEC_INST(                         \
+    SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \
+    MEM_SPACE_TYPE)                                                       \
+  template struct SPADD_NUMERIC<                                          \
+      KokkosKernels::Experimental::KokkosKernelsHandle<                   \
+          const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE,       \
+          EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>,               \
+      Kokkos::View<const OFFSET_TYPE *, LAYOUT_TYPE,                      \
+                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const ORDINAL_TYPE *, LAYOUT_TYPE,                     \
+                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const SCALAR_TYPE *, LAYOUT_TYPE,                      \
+                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const OFFSET_TYPE *, LAYOUT_TYPE,                      \
+                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const ORDINAL_TYPE *, LAYOUT_TYPE,                     \
+                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const SCALAR_TYPE *, LAYOUT_TYPE,                      \
+                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const OFFSET_TYPE *, LAYOUT_TYPE,                      \
+                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<ORDINAL_TYPE *, LAYOUT_TYPE,                           \
+                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<SCALAR_TYPE *, LAYOUT_TYPE,                            \
+                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      false, true>;
+
+#include <KokkosSparse_spadd_tpl_spec_decl.hpp>
+#include <generated_specializations_hpp/KokkosSparse_spadd_numeric_eti_spec_decl.hpp>
+
+#endif
diff --git a/src/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp b/src/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp
new file mode 100644
index 0000000000..c4ae435f55
--- /dev/null
+++ b/src/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp
@@ -0,0 +1,635 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Brian Kelley (bmkelle@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef _KOKKOS_SPADD_SYMBOLIC_IMPL_HPP
+#define _KOKKOS_SPADD_SYMBOLIC_IMPL_HPP
+
+#include "KokkosKernels_Handle.hpp"
+#include "KokkosSparse_SortCrs.hpp"
+#include "Kokkos_ArithTraits.hpp"
+
+namespace KokkosSparse {
+namespace Impl {
+
+// Helper macro to check that two types are the same (ignoring const)
+#define SAME_TYPE(A, B)                             \
+  std::is_same<typename std::remove_const<A>::type, \
+               typename std::remove_const<B>::type>::value
+
+// get C rowmap for sorted input
+template <typename size_type, typename ordinal_type, typename ARowPtrsT,
+          typename BRowPtrsT, typename AColIndsT, typename BColIndsT,
+          typename CRowPtrsT, typename ExecSpace>
+struct SortedCountEntriesRange {
+  SortedCountEntriesRange(ordinal_type nrows_,
+                          const typename ARowPtrsT::const_type& Arowptrs_,
+                          const AColIndsT& Acolinds_,
+                          const typename BRowPtrsT::const_type& Browptrs_,
+                          const BColIndsT& Bcolinds_,
+                          const CRowPtrsT& Crowcounts_)
+      : nrows(nrows_),
+        Arowptrs(Arowptrs_),
+        Acolinds(Acolinds_),
+        Browptrs(Browptrs_),
+        Bcolinds(Bcolinds_),
+        Crowcounts(Crowcounts_) {}
+
+  KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const {
+    const ordinal_type ORDINAL_MAX = Kokkos::ArithTraits<ordinal_type>::max();
+
+    // count the union of nonzeros in Arow and Brow
+    size_type numEntries = 0;
+    size_type ai         = 0;
+    size_type bi         = 0;
+    size_type Arowstart  = Arowptrs(i);
+    size_type Arowlen    = Arowptrs(i + 1) - Arowstart;
+    size_type Browstart  = Browptrs(i);
+    size_type Browlen    = Browptrs(i + 1) - Browstart;
+    ordinal_type Acol    = (Arowlen == 0) ? ORDINAL_MAX : Acolinds(Arowstart);
+    ordinal_type Bcol    = (Browlen == 0) ? ORDINAL_MAX : Bcolinds(Browstart);
+    while (Acol != ORDINAL_MAX || Bcol != ORDINAL_MAX) {
+      ordinal_type Ccol = (Acol < Bcol) ? Acol : Bcol;
+      numEntries++;
+      // Eat all entries in both A and B which have this column
+      // This also results in Acol/Bcol being updated to following entries for
+      // next loop iter
+      while (Acol == Ccol)
+        Acol = (ai == Arowlen) ? ORDINAL_MAX : Acolinds(Arowstart + ai++);
+      while (Bcol == Ccol)
+        Bcol = (bi == Browlen) ? ORDINAL_MAX : Bcolinds(Browstart + bi++);
+    }
+    Crowcounts(i) = numEntries;
+  }
+
+  ordinal_type nrows;
+  const typename ARowPtrsT::const_type Arowptrs;
+  const AColIndsT Acolinds;
+  const typename BRowPtrsT::const_type Browptrs;
+  const BColIndsT Bcolinds;
+  CRowPtrsT Crowcounts;
+};
+
+template <typename size_type, typename ordinal_type, typename ARowPtrsT,
+          typename BRowPtrsT, typename AColIndsT, typename BColIndsT,
+          typename CRowPtrsT, typename ExecSpace>
+struct SortedCountEntriesTeam {
+  SortedCountEntriesTeam(ordinal_type nrows_,
+                         const typename ARowPtrsT::const_type& Arowptrs_,
+                         const AColIndsT& Acolinds_,
+                         const typename BRowPtrsT::const_type& Browptrs_,
+                         const BColIndsT& Bcolinds_,
+                         const CRowPtrsT& Crowcounts_)
+      : nrows(nrows_),
+        Arowptrs(Arowptrs_),
+        Acolinds(Acolinds_),
+        Browptrs(Browptrs_),
+        Bcolinds(Bcolinds_),
+        Crowcounts(Crowcounts_) {}
+
+  using TeamPol = Kokkos::TeamPolicy<ExecSpace>;
+  using TeamMem = typename TeamPol::member_type;
+
+  KOKKOS_INLINE_FUNCTION void longRowFallback(const ordinal_type i) const {
+    const ordinal_type ORDINAL_MAX = Kokkos::ArithTraits<ordinal_type>::max();
+
+    // count the union of nonzeros in Arow and Brow
+    size_type numEntries = 0;
+    size_type ai         = 0;
+    size_type bi         = 0;
+    size_type Arowstart  = Arowptrs(i);
+    size_type Arowlen    = Arowptrs(i + 1) - Arowstart;
+    size_type Browstart  = Browptrs(i);
+    size_type Browlen    = Browptrs(i + 1) - Browstart;
+    ordinal_type Acol    = (Arowlen == 0) ? ORDINAL_MAX : Acolinds(Arowstart);
+    ordinal_type Bcol    = (Browlen == 0) ? ORDINAL_MAX : Bcolinds(Browstart);
+    while (Acol != ORDINAL_MAX || Bcol != ORDINAL_MAX) {
+      ordinal_type Ccol = (Acol < Bcol) ? Acol : Bcol;
+      numEntries++;
+      // Eat all entries in both A and B which have this column
+      // This also results in Acol/Bcol being updated to following entries for
+      // next loop iter
+      while (Acol == Ccol)
+        Acol = (ai == Arowlen) ? ORDINAL_MAX : Acolinds(Arowstart + ai++);
+      while (Bcol == Ccol)
+        Bcol = (bi == Browlen) ? ORDINAL_MAX : Bcolinds(Browstart + bi++);
+    }
+    Crowcounts(i) = numEntries;
+  }
+
+  KOKKOS_INLINE_FUNCTION void operator()(const TeamMem t) const {
+    ordinal_type i = t.league_rank() * t.team_size() + t.team_rank();
+    if (i >= nrows) return;
+    ordinal_type* allScratch =
+        (ordinal_type*)t.team_shmem().get_shmem(totalShared);
+    ordinal_type* scratch  = allScratch + t.team_rank() * sharedPerThread;
+    ordinal_type Arowstart = Arowptrs(i);
+    ordinal_type Arowlen   = Arowptrs(i + 1) - Arowstart;
+    ordinal_type Browstart = Browptrs(i);
+    ordinal_type Browlen   = Browptrs(i + 1) - Browstart;
+    ordinal_type n         = Arowlen + Browlen;
+    if (n > sharedPerThread) {
+      // fall back to slow serial method
+      Kokkos::single(Kokkos::PerThread(t), [&]() { longRowFallback(i); });
+      return;
+    }
+    if (n == 0) {
+      Kokkos::single(Kokkos::PerThread(t), [&]() { Crowcounts(i) = 0; });
+      return;
+    }
+    // Figure out the number of bitonic steps: ceil(log2(n))
+    ordinal_type npot   = 1;
+    ordinal_type levels = 0;
+    while (npot < n) {
+      levels++;
+      npot <<= 1;
+    }
+    // Copy A and B entries to scratch
+    Kokkos::parallel_for(
+        Kokkos::ThreadVectorRange(t, Arowlen),
+        [&](ordinal_type j) { scratch[j] = Acolinds(Arowstart + j); });
+    Kokkos::parallel_for(Kokkos::ThreadVectorRange(t, Browlen),
+                         [&](ordinal_type j) {
+                           scratch[npot - 1 - j] = Bcolinds(Browstart + j);
+                         });
+    // Fill space between A and B with ORDINAL_MAX,
+    // to maintain a valid bitonic sequence of power-of-two length
+    Kokkos::parallel_for(
+        Kokkos::ThreadVectorRange(t, npot - n), [&](ordinal_type j) {
+          scratch[Arowlen + j] = Kokkos::ArithTraits<ordinal_type>::max();
+        });
+    // npot = 2^levels
+    for (ordinal_type level = 0; level < levels; level++) {
+      // npot/2 pairs of items are compared in parallel
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(t, npot >> 1),
+                           [&](const ordinal_type j) {
+                             ordinal_type boxSize = npot >> level;
+                             // Which box contains this thread?
+                             // box = (j / boxSize), and boxSize =
+                             // 2^(levels-level), so box = j * 2^(level-levels)
+                             // = j >> (levels - level)
+                             ordinal_type boxID = (j * 2) >> (levels - level);
+                             // boxStart = boxID * boxSize = boxID *
+                             // 2^(levels-level) = boxID << (levels-level)
+                             ordinal_type boxStart  = boxID << (levels - level);
+                             ordinal_type boxOffset = j - boxID * boxSize / 2;
+                             ordinal_type elem1     = boxStart + boxOffset;
+                             ordinal_type elem2     = elem1 + (boxSize >> 1);
+                             if (scratch[elem2] < scratch[elem1]) {
+                               ordinal_type temp = scratch[elem1];
+                               scratch[elem1]    = scratch[elem2];
+                               scratch[elem2]    = temp;
+                             }
+                           });
+    }
+    // Finally, count the number of distinct entries (this is #rising edges + 1)
+    ordinal_type risingEdges;
+    Kokkos::parallel_reduce(
+        Kokkos::ThreadVectorRange(t, n - 1),
+        [&](const ordinal_type j, ordinal_type& lcount) {
+          if (scratch[j] != scratch[j + 1]) lcount++;
+        },
+        risingEdges);
+    Kokkos::single(Kokkos::PerThread(t),
+                   [&]() { Crowcounts(i) = risingEdges + 1; });
+  }
+
+  size_t team_shmem_size(int teamSize) const {
+    return sharedPerThread * sizeof(ordinal_type) * teamSize;
+  }
+
+  ordinal_type nrows;
+  const typename ARowPtrsT::const_type Arowptrs;
+  const AColIndsT Acolinds;
+  const typename BRowPtrsT::const_type Browptrs;
+  const BColIndsT Bcolinds;
+  CRowPtrsT Crowcounts;
+  int sharedPerThread;  // Shared for each thread, measured in
+                        // sizeof(ordinal_type)
+  int totalShared;      // Shared for whole team, measured in bytes
+};
+
+// get upper bound for C entries per row (assumes worst case, that entries in A
+// and B on each row are disjoint)
+template <typename size_type, typename ordinal_type, typename ARowPtrsT,
+          typename BRowPtrsT, typename CRowPtrsT>
+struct UnsortedEntriesUpperBound {
+  UnsortedEntriesUpperBound(ordinal_type nrows_,
+                            const typename ARowPtrsT::const_type& Arowptrs_,
+                            const typename BRowPtrsT::const_type& Browptrs_,
+                            const CRowPtrsT& Crowcounts_)
+      : nrows(nrows_),
+        Arowptrs(Arowptrs_),
+        Browptrs(Browptrs_),
+        Crowcounts(Crowcounts_) {}
+  KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const {
+    Crowcounts(i) =
+        (Arowptrs(i + 1) - Arowptrs(i)) + (Browptrs(i + 1) - Browptrs(i));
+    if (i == nrows - 1) {
+      // last workitem also zeros the one-past-end entry of row counts, so
+      // that prefix sum is correct
+      Crowcounts(nrows) = 0;
+    }
+  }
+  ordinal_type nrows;
+  const typename ARowPtrsT::const_type Arowptrs;
+  const typename BRowPtrsT::const_type Browptrs;
+  CRowPtrsT Crowcounts;
+};
+
+// Unsorted symbolic: new functors:
+//  -compute uncompressed C (entries only, no values)
+//  -sort uncompressed C entries within row, while permuting A union B
+//  permutation array -compress sorted C entries and A,B perm arrays at the same
+//  time, which produces Crowcounts value
+// Inputs: A, B rowptrs/colinds, C uncompressed rowptrs (and allocated C
+// entries) Output: C uncompressed colinds
+template <typename size_type, typename ordinal_type, typename ArowptrsT,
+          typename BrowptrsT, typename CrowptrsT, typename AcolindsT,
+          typename BcolindsT, typename CcolindsT>
+struct UnmergedSumFunctor {
+  UnmergedSumFunctor(ordinal_type nrows_, const ArowptrsT& Arowptrs_,
+                     const AcolindsT& Acolinds_, const BrowptrsT& Browptrs_,
+                     const BcolindsT& Bcolinds_, const CrowptrsT& Crowptrs_,
+                     const CcolindsT& Ccolinds_, const CcolindsT& ABperm_)
+      : nrows(nrows_),
+        Arowptrs(Arowptrs_),
+        Acolinds(Acolinds_),
+        Browptrs(Browptrs_),
+        Bcolinds(Bcolinds_),
+        Crowptrs(Crowptrs_),
+        Ccolinds(Ccolinds_),
+        ABperm(ABperm_) {}
+  KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const {
+    size_type inserted  = 0;
+    size_type crowstart = Crowptrs(i);
+    size_type arowstart = Arowptrs(i);
+    size_type arowlen   = Arowptrs(i + 1) - arowstart;
+    size_type browstart = Browptrs(i);
+    size_type browlen   = Browptrs(i + 1) - browstart;
+    // Insert all A entries, then all B entries
+    for (size_type j = 0; j < arowlen; j++) {
+      Ccolinds(crowstart + inserted) = Acolinds(arowstart + j);
+      ABperm(crowstart + inserted)   = j;
+      inserted++;
+    }
+    for (size_type j = 0; j < browlen; j++) {
+      Ccolinds(crowstart + inserted) = Bcolinds(browstart + j);
+      // tell A and B permutation values apart by adding arowlen as a bias to B
+      // values
+      ABperm(crowstart + inserted) = j + arowlen;
+      inserted++;
+    }
+  }
+  ordinal_type nrows;
+  const ArowptrsT Arowptrs;
+  const AcolindsT Acolinds;
+  const BrowptrsT Browptrs;
+  const BcolindsT Bcolinds;
+  const CrowptrsT Crowptrs;
+  CcolindsT Ccolinds;
+  CcolindsT ABperm;
+};
+
+template <typename size_type, typename ordinal_type, typename ArowptrsT,
+          typename BrowptrsT, typename CrowptrsT, typename CcolindsT,
+          typename OffsetView>
+struct MergeEntriesFunctor {
+  MergeEntriesFunctor(ordinal_type nrows_, const ArowptrsT& Arowptrs_,
+                      const BrowptrsT& Browptrs_, const OffsetView& Crowptrs_,
+                      const CrowptrsT& Crowcounts_, const CcolindsT& Ccolinds_,
+                      const CcolindsT& ABperm_, const CcolindsT& Apos_,
+                      const CcolindsT& Bpos_)
+      : nrows(nrows_),
+        Arowptrs(Arowptrs_),
+        Browptrs(Browptrs_),
+        Crowptrs(Crowptrs_),
+        Crowcounts(Crowcounts_),
+        Ccolinds(Ccolinds_),
+        ABperm(ABperm_),
+        Apos(Apos_),
+        Bpos(Bpos_) {}
+  KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const {
+    size_type CrowStart = Crowptrs(i);
+    size_type CrowEnd   = Crowptrs(i + 1);
+    if (CrowEnd == CrowStart) {
+      Crowcounts(i) = 0;
+      return;
+    }
+    size_type ArowStart = Arowptrs(i);
+    size_type ArowNum   = Arowptrs(i + 1) - ArowStart;
+    size_type BrowStart = Browptrs(i);
+    ordinal_type CFit   = 0;  // counting through merged C indices (within row)
+    for (size_type Cit = CrowStart; Cit < CrowEnd; Cit++) {
+      if ((Cit > CrowStart) && (Ccolinds(Cit) != Ccolinds(Cit - 1))) {
+        // This is a different column than the previous entry, and is not the
+        // first entry. This means that this is the first occurence of a unique
+        // column.
+        CFit++;
+      }
+      size_type permVal = ABperm(Cit);
+      if (permVal < ArowNum) {
+        // Entry belongs to A
+        ordinal_type Aindex = permVal;
+        // The Aindex'th entry in row i of A will be added into the CFit'th
+        // entry in C
+        Apos(ArowStart + Aindex) = CFit;
+      } else {
+        // Entry belongs to B
+        ordinal_type Bindex = permVal - ArowNum;
+        // The Bindex'th entry in row i of B will be added into the CFit'th
+        // entry in C
+        Bpos(BrowStart + Bindex) = CFit;
+      }
+    }
+    // At end of the row, know how many entries are in merged C.
+    // Right now, CFit is the index of the last Apos/Bpos,
+    // so adding one gives the total number of entries.
+    Crowcounts(i) = CFit + 1;
+  }
+  ordinal_type nrows;
+  const ArowptrsT Arowptrs;
+  const BrowptrsT Browptrs;
+  const OffsetView Crowptrs;
+  CrowptrsT Crowcounts;
+  CcolindsT Ccolinds;
+  const CcolindsT ABperm;
+  CcolindsT Apos;
+  CcolindsT Bpos;
+};
+
+// Run SortedCountEntries: non-GPU, always uses the RangePolicy version.
+template <typename KernelHandle, typename alno_row_view_t_,
+          typename alno_nnz_view_t_, typename blno_row_view_t_,
+          typename blno_nnz_view_t_, typename clno_row_view_t_>
+void runSortedCountEntries(
+    const alno_row_view_t_& a_rowmap, const alno_nnz_view_t_& a_entries,
+    const blno_row_view_t_& b_rowmap, const blno_nnz_view_t_& b_entries,
+    const clno_row_view_t_& c_rowmap,
+    typename std::enable_if<!KokkosKernels::Impl::kk_is_gpu_exec_space<
+        typename KernelHandle::SPADDHandleType::execution_space>()>::type* =
+        nullptr) {
+  using size_type    = typename KernelHandle::size_type;
+  using ordinal_type = typename KernelHandle::nnz_lno_t;
+  using execution_space =
+      typename KernelHandle::SPADDHandleType::execution_space;
+  using range_type = Kokkos::RangePolicy<execution_space>;
+  auto nrows       = c_rowmap.extent(0) - 1;
+  SortedCountEntriesRange<size_type, ordinal_type, alno_row_view_t_,
+                          blno_row_view_t_, alno_nnz_view_t_, blno_nnz_view_t_,
+                          clno_row_view_t_, execution_space>
+      countEntries(nrows, a_rowmap, a_entries, b_rowmap, b_entries, c_rowmap);
+  Kokkos::parallel_for(
+      "KokkosSparse::SpAdd::Symbolic::InputSorted::CountEntries",
+      range_type(0, nrows), countEntries);
+}
+
+// Run SortedCountEntries: GPU, uses the TeamPolicy or RangePolicy depending
+//  on average nz per row (a runtime decision)
+template <typename KernelHandle, typename alno_row_view_t_,
+          typename alno_nnz_view_t_, typename blno_row_view_t_,
+          typename blno_nnz_view_t_, typename clno_row_view_t_>
+void runSortedCountEntries(
+    const alno_row_view_t_& a_rowmap, const alno_nnz_view_t_& a_entries,
+    const blno_row_view_t_& b_rowmap, const blno_nnz_view_t_& b_entries,
+    const clno_row_view_t_& c_rowmap,
+    typename std::enable_if<KokkosKernels::Impl::kk_is_gpu_exec_space<
+        typename KernelHandle::SPADDHandleType::execution_space>()>::type* =
+        nullptr) {
+  using size_type    = typename KernelHandle::size_type;
+  using ordinal_type = typename KernelHandle::nnz_lno_t;
+  using execution_space =
+      typename KernelHandle::SPADDHandleType::execution_space;
+  using RangePol = Kokkos::RangePolicy<execution_space>;
+  using TeamPol  = Kokkos::TeamPolicy<execution_space>;
+  auto nrows     = c_rowmap.extent(0) - 1;
+  size_type c_est_nnz =
+      1.4 * (a_entries.extent(0) + b_entries.extent(0)) / nrows;
+  if (c_est_nnz <= 512) {
+    // Convert c_est_nnz to a power of 2
+    size_type pot_est_nnz = 1;
+    while (pot_est_nnz < c_est_nnz) pot_est_nnz *= 2;
+    // Estimate max number of uncompressed entries in each row of C
+    int vector_length = 1;
+    int vector_length_max =
+        KokkosKernels::Impl::kk_get_max_vector_size<execution_space>();
+    while (vector_length * 2 <= vector_length_max &&
+           (size_type)vector_length * 2 <= pot_est_nnz) {
+      vector_length *= 2;
+    }
+    SortedCountEntriesTeam<size_type, ordinal_type, alno_row_view_t_,
+                           blno_row_view_t_, alno_nnz_view_t_, blno_nnz_view_t_,
+                           clno_row_view_t_, execution_space>
+        countEntries(nrows, a_rowmap, a_entries, b_rowmap, b_entries, c_rowmap);
+    countEntries.sharedPerThread = pot_est_nnz;
+    // compute largest possible team size
+    TeamPol testPolicy(1, 1, vector_length);
+    testPolicy.set_scratch_size(
+        0, Kokkos::PerThread(pot_est_nnz * sizeof(ordinal_type)));
+    int team_size = testPolicy.team_size_recommended(countEntries,
+                                                     Kokkos::ParallelForTag());
+    // construct real policy
+    int league_size = (nrows + team_size - 1) / team_size;
+    TeamPol policy(league_size, team_size, vector_length);
+    policy.set_scratch_size(
+        0, Kokkos::PerThread(pot_est_nnz * sizeof(ordinal_type)));
+    countEntries.totalShared =
+        countEntries.sharedPerThread * team_size * sizeof(ordinal_type);
+    Kokkos::parallel_for(
+        "KokkosSparse::SpAdd::Symbolic::InputSorted::CountEntries", policy,
+        countEntries);
+  } else {
+    SortedCountEntriesRange<size_type, ordinal_type, alno_row_view_t_,
+                            blno_row_view_t_, alno_nnz_view_t_,
+                            blno_nnz_view_t_, clno_row_view_t_, execution_space>
+        countEntries(nrows, a_rowmap, a_entries, b_rowmap, b_entries, c_rowmap);
+    Kokkos::parallel_for(
+        "KokkosSparse::SpAdd::Symbolic::InputSorted::CountEntries",
+        RangePol(0, nrows), countEntries);
+  }
+}
+
+// Symbolic: count entries in each row in C to produce rowmap
+// kernel handle has information about whether it is sorted add or not.
+template <typename KernelHandle, typename alno_row_view_t_,
+          typename alno_nnz_view_t_, typename blno_row_view_t_,
+          typename blno_nnz_view_t_, typename clno_row_view_t_>
+void spadd_symbolic_impl(
+    KernelHandle* handle, const alno_row_view_t_ a_rowmap,
+    const alno_nnz_view_t_ a_entries, const blno_row_view_t_ b_rowmap,
+    const blno_nnz_view_t_ b_entries,
+    clno_row_view_t_ c_rowmap)  // c_rowmap must already be allocated (doesn't
+                                // need to be initialized)
+{
+  typedef
+      typename KernelHandle::SPADDHandleType::execution_space execution_space;
+  typedef typename KernelHandle::size_type size_type;
+  typedef typename KernelHandle::nnz_lno_t ordinal_type;
+  typedef typename KernelHandle::SPADDHandleType::nnz_lno_view_t ordinal_view_t;
+  typedef typename KernelHandle::SPADDHandleType::nnz_row_view_t offset_view_t;
+  // Check that A/B/C data types match KernelHandle types, and that C data types
+  // are nonconst (doesn't matter if A/B types are const)
+  static_assert(
+      SAME_TYPE(typename alno_row_view_t_::non_const_value_type, size_type),
+      "add_symbolic: A size_type must match KernelHandle size_type (const "
+      "doesn't matter)");
+  static_assert(
+      SAME_TYPE(typename blno_row_view_t_::non_const_value_type, size_type),
+      "add_symbolic: B size_type must match KernelHandle size_type (const "
+      "doesn't matter)");
+  static_assert(
+      SAME_TYPE(typename clno_row_view_t_::non_const_value_type, size_type),
+      "add_symbolic: C size_type must match KernelHandle size_type)");
+  static_assert(std::is_same<typename clno_row_view_t_::non_const_value_type,
+                             typename clno_row_view_t_::value_type>::value,
+                "add_symbolic: C size_type must not be const");
+  static_assert(
+      SAME_TYPE(typename alno_nnz_view_t_::non_const_value_type, ordinal_type),
+      "add_symbolic: A entry type must match KernelHandle entry type (aka "
+      "nnz_lno_t, and const doesn't matter)");
+  static_assert(
+      SAME_TYPE(typename blno_nnz_view_t_::non_const_value_type, ordinal_type),
+      "add_symbolic: B entry type must match KernelHandle entry type (aka "
+      "nnz_lno_t, and const doesn't matter)");
+  static_assert(std::is_same<typename clno_row_view_t_::non_const_value_type,
+                             typename clno_row_view_t_::value_type>::value,
+                "add_symbolic: C entry type must not be const");
+  // symbolic just needs to compute c_rowmap
+  // easy for sorted, but for unsorted is easiest to just compute the whole sum
+  auto addHandle = handle->get_spadd_handle();
+  if (a_rowmap.extent(0) == 0 || a_rowmap.extent(0) == 1) {
+    // Have 0 rows, so nothing to do except set #nnz to 0
+    addHandle->set_c_nnz(0);
+    // If c_rowmap has a single entry, it must be 0
+    if (c_rowmap.extent(0)) Kokkos::deep_copy(c_rowmap, (size_type)0);
+    addHandle->set_call_symbolic();
+    return;
+  }
+  ordinal_type nrows = a_rowmap.extent(0) - 1;
+  typedef Kokkos::RangePolicy<execution_space, ordinal_type> range_type;
+  if (addHandle->is_input_sorted()) {
+    runSortedCountEntries<KernelHandle, alno_row_view_t_, alno_nnz_view_t_,
+                          blno_row_view_t_, blno_nnz_view_t_, clno_row_view_t_>(
+        a_rowmap, a_entries, b_rowmap, b_entries, c_rowmap);
+    KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum<clno_row_view_t_,
+                                                          execution_space>(
+        nrows + 1, c_rowmap);
+  } else {
+    // note: scoping individual parts of the process to free views sooner,
+    // minimizing peak memory usage run the unsorted c_rowmap upper bound
+    // functor (just adds together A and B entry counts row by row)
+    offset_view_t c_rowmap_upperbound(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                           "C row counts upper bound"),
+        nrows + 1);
+    size_type c_nnz_upperbound = 0;
+    {
+      UnsortedEntriesUpperBound<size_type, ordinal_type, alno_row_view_t_,
+                                blno_row_view_t_, offset_view_t>
+          countEntries(nrows, a_rowmap, b_rowmap, c_rowmap_upperbound);
+      Kokkos::parallel_for(
+          "KokkosSparse::SpAdd:Symbolic::InputNotSorted::CountEntries",
+          range_type(0, nrows), countEntries);
+      KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum<offset_view_t,
+                                                            execution_space>(
+          nrows + 1, c_rowmap_upperbound);
+      Kokkos::deep_copy(c_nnz_upperbound,
+                        Kokkos::subview(c_rowmap_upperbound, nrows));
+    }
+    ordinal_view_t c_entries_uncompressed(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                           "C entries uncompressed"),
+        c_nnz_upperbound);
+    ordinal_view_t ab_perm(Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                                              "A and B permuted entry indices"),
+                           c_nnz_upperbound);
+    // compute the unmerged sum
+    UnmergedSumFunctor<size_type, ordinal_type, alno_row_view_t_,
+                       blno_row_view_t_, offset_view_t, alno_nnz_view_t_,
+                       blno_nnz_view_t_, ordinal_view_t>
+        unmergedSum(nrows, a_rowmap, a_entries, b_rowmap, b_entries,
+                    c_rowmap_upperbound, c_entries_uncompressed, ab_perm);
+    Kokkos::parallel_for(
+        "KokkosSparse::SpAdd:Symbolic::InputNotSorted::UnmergedSum",
+        range_type(0, nrows), unmergedSum);
+    // sort the unmerged sum
+    KokkosSparse::sort_crs_matrix<execution_space, offset_view_t,
+                                  ordinal_view_t, ordinal_view_t>(
+        c_rowmap_upperbound, c_entries_uncompressed, ab_perm);
+    ordinal_view_t a_pos(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing, "A entry positions"),
+        a_entries.extent(0));
+    ordinal_view_t b_pos(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing, "B entry positions"),
+        b_entries.extent(0));
+    // merge the entries and compute Apos/Bpos, as well as Crowcounts
+    {
+      MergeEntriesFunctor<size_type, ordinal_type, alno_row_view_t_,
+                          blno_row_view_t_, offset_view_t, ordinal_view_t,
+                          offset_view_t>
+          mergeEntries(nrows, a_rowmap, b_rowmap, c_rowmap_upperbound, c_rowmap,
+                       c_entries_uncompressed, ab_perm, a_pos, b_pos);
+      Kokkos::parallel_for(
+          "KokkosSparse::SpAdd:Symbolic::InputNotSorted::MergeEntries",
+          range_type(0, nrows), mergeEntries);
+      // compute actual c_rowmap
+      KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum<clno_row_view_t_,
+                                                            execution_space>(
+          nrows + 1, c_rowmap);
+    }
+    addHandle->set_a_b_pos(a_pos, b_pos);
+  }
+  // provide the number of NNZ in C to user through handle
+  size_type cmax;
+  Kokkos::deep_copy(cmax, Kokkos::subview(c_rowmap, nrows));
+  addHandle->set_c_nnz(cmax);
+  addHandle->set_call_symbolic();
+  addHandle->set_call_numeric(false);
+}
+
+#undef SAME_TYPE
+
+}  // namespace Impl
+}  // namespace KokkosSparse
+
+#endif
diff --git a/src/sparse/impl/KokkosSparse_spadd_symbolic_spec.hpp b/src/sparse/impl/KokkosSparse_spadd_symbolic_spec.hpp
new file mode 100644
index 0000000000..7a48999e6a
--- /dev/null
+++ b/src/sparse/impl/KokkosSparse_spadd_symbolic_spec.hpp
@@ -0,0 +1,189 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+#ifndef KOKKOSSPARSE_IMPL_SPADD_SYMBOLIC_SPEC_HPP_
+#define KOKKOSSPARSE_IMPL_SPADD_SYMBOLIC_SPEC_HPP_
+
+#include <KokkosKernels_config.h>
+
+#include <Kokkos_Core.hpp>
+#include "KokkosKernels_Handle.hpp"
+// Include the actual functors
+#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY
+#include "KokkosSparse_spadd_symbolic_impl.hpp"
+#endif
+
+namespace KokkosSparse {
+namespace Impl {
+// Specialization struct which defines whether a specialization exists
+template <class KernelHandle, class a_size_view_t_, class a_lno_view_t,
+          class b_size_view_t_, class b_lno_view_t, class c_size_view_t_>
+struct spadd_symbolic_eti_spec_avail {
+  enum : bool { value = false };
+};
+
+}  // namespace Impl
+}  // namespace KokkosSparse
+
+#define KOKKOSSPARSE_SPADD_SYMBOLIC_ETI_SPEC_AVAIL(                       \
+    SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \
+    MEM_SPACE_TYPE)                                                       \
+  template <>                                                             \
+  struct spadd_symbolic_eti_spec_avail<                                   \
+      KokkosKernels::Experimental::KokkosKernelsHandle<                   \
+          const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE,       \
+          EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>,               \
+      Kokkos::View<const OFFSET_TYPE *, LAYOUT_TYPE,                      \
+                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const ORDINAL_TYPE *, LAYOUT_TYPE,                     \
+                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const OFFSET_TYPE *, LAYOUT_TYPE,                      \
+                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const ORDINAL_TYPE *, LAYOUT_TYPE,                     \
+                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<OFFSET_TYPE *, LAYOUT_TYPE,                            \
+                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> > > {          \
+    enum : bool { value = true };                                         \
+  };
+
+// Include the actual specialization declarations
+#include <KokkosSparse_spadd_tpl_spec_avail.hpp>
+#include <generated_specializations_hpp/KokkosSparse_spadd_symbolic_eti_spec_avail.hpp>
+
+namespace KokkosSparse {
+namespace Impl {
+
+// Unification layer
+/// \brief Implementation of KokkosBlas::spadd (sparse-sparse matrix addition)
+
+template <class KernelHandle, class a_size_view_t, class a_lno_view_t,
+          class b_size_view_t, class b_lno_view_t, class c_size_view_t,
+          bool tpl_spec_avail = spadd_symbolic_tpl_spec_avail<
+              KernelHandle, a_size_view_t, a_lno_view_t, b_size_view_t,
+              b_lno_view_t, c_size_view_t>::value,
+          bool eti_spec_avail = spadd_symbolic_eti_spec_avail<
+              KernelHandle, a_size_view_t, a_lno_view_t, b_size_view_t,
+              b_lno_view_t, c_size_view_t>::value>
+struct SPADD_SYMBOLIC {
+  static void spadd_symbolic(KernelHandle *handle, a_size_view_t row_mapA,
+                             a_lno_view_t entriesA, b_size_view_t row_mapB,
+                             b_lno_view_t entriesB, c_size_view_t row_mapC);
+};
+
+#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY
+
+template <class KernelHandle, class a_size_view_t, class a_lno_view_t,
+          class b_size_view_t, class b_lno_view_t, class c_size_view_t>
+struct SPADD_SYMBOLIC<KernelHandle, a_size_view_t, a_lno_view_t, b_size_view_t,
+                      b_lno_view_t, c_size_view_t, false,
+                      KOKKOSKERNELS_IMPL_COMPILE_LIBRARY> {
+  static void spadd_symbolic(KernelHandle *handle, a_size_view_t row_mapA,
+                             a_lno_view_t entriesA, b_size_view_t row_mapB,
+                             b_lno_view_t entriesB, c_size_view_t row_mapC) {
+    spadd_symbolic_impl(handle, row_mapA, entriesA, row_mapB, entriesB,
+                        row_mapC);
+  }
+};
+
+#endif
+
+}  // namespace Impl
+}  // namespace KokkosSparse
+
+#define KOKKOSSPARSE_SPADD_SYMBOLIC_ETI_SPEC_DECL(                        \
+    SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \
+    MEM_SPACE_TYPE)                                                       \
+  extern template struct SPADD_SYMBOLIC<                                  \
+      typename KokkosKernels::Experimental::KokkosKernelsHandle<          \
+          const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE,       \
+          EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>,               \
+      Kokkos::View<const OFFSET_TYPE *, LAYOUT_TYPE,                      \
+                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const ORDINAL_TYPE *, LAYOUT_TYPE,                     \
+                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const OFFSET_TYPE *, LAYOUT_TYPE,                      \
+                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const ORDINAL_TYPE *, LAYOUT_TYPE,                     \
+                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<OFFSET_TYPE *, LAYOUT_TYPE,                            \
+                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      false, true>;
+
+#define KOKKOSSPARSE_SPADD_SYMBOLIC_ETI_SPEC_INST(                        \
+    SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \
+    MEM_SPACE_TYPE)                                                       \
+  template struct SPADD_SYMBOLIC<                                         \
+      KokkosKernels::Experimental::KokkosKernelsHandle<                   \
+          const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE,       \
+          EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>,               \
+      Kokkos::View<const OFFSET_TYPE *, LAYOUT_TYPE,                      \
+                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const ORDINAL_TYPE *, LAYOUT_TYPE,                     \
+                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const OFFSET_TYPE *, LAYOUT_TYPE,                      \
+                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<const ORDINAL_TYPE *, LAYOUT_TYPE,                     \
+                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      Kokkos::View<OFFSET_TYPE *, LAYOUT_TYPE,                            \
+                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
+      false, true>;
+
+#include <KokkosSparse_spadd_tpl_spec_decl.hpp>
+#include <generated_specializations_hpp/KokkosSparse_spadd_symbolic_eti_spec_decl.hpp>
+
+#endif
diff --git a/src/sparse/impl/KokkosSparse_spgemm_imp_outer.hpp b/src/sparse/impl/KokkosSparse_spgemm_imp_outer.hpp
index e566e8bf06..c6a24e2163 100644
--- a/src/sparse/impl/KokkosSparse_spgemm_imp_outer.hpp
+++ b/src/sparse/impl/KokkosSparse_spgemm_imp_outer.hpp
@@ -509,7 +509,7 @@ struct KokkosSPGEMM<HandleType, a_row_view_t_, a_lno_nnz_view_t_,
   }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile size_type &dst, const volatile size_type &src) const {
+  void join(size_type &dst, const size_type &src) const {
     if (dst < src) {
       dst = src;
     }
diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl.hpp
index 09a8bf212a..dadc944b09 100644
--- a/src/sparse/impl/KokkosSparse_spgemm_impl.hpp
+++ b/src/sparse/impl/KokkosSparse_spgemm_impl.hpp
@@ -45,17 +45,9 @@
 #ifndef _KOKKOSSPGEMMIMPL_HPP
 #define _KOKKOSSPGEMMIMPL_HPP
 
-//#define KOKKOSKERNELS_ANALYZE_COMPRESSION
-//#define KOKKOSKERNELS_ANALYZE_MEMORYACCESS
-//#define HASHTRACK
-
-//#define TRACK_INSERTS
-//#define GPU_EXPERIMENTAL
-//#define NUMERIC_USE_STATICMEM
-//#define twostep
 #include <KokkosKernels_Utils.hpp>
 #include <KokkosKernels_SimpleUtils.hpp>
-#include <KokkosKernels_SparseUtils.hpp>
+#include <KokkosSparse_Utils.hpp>
 #include <KokkosKernels_VectorUtils.hpp>
 #include <fstream>
 #include <sstream>
@@ -282,7 +274,7 @@ class KokkosSPGEMM {
   typedef Kokkos::TeamPolicy<MyExecSpace, Kokkos::Schedule<Kokkos::Dynamic> >
       dynamic_team_policy_t;
 
- private:
+ protected:
   HandleType *handle;
   nnz_lno_t a_row_cnt;
   nnz_lno_t b_row_cnt;
@@ -795,7 +787,7 @@ class KokkosSPGEMM {
       typename c_scalar_nnz_view_t::const_value_type omega, dinv_view_t dinv,
       KokkosKernels::Impl::ExecSpaceType lcl_my_exec_space);
 
- private:
+ protected:
   template <typename c_row_view_t, typename c_lno_nnz_view_t,
             typename c_scalar_nnz_view_t, typename dinv_view_t>
   void KokkosSPGEMM_jacobi_denseacc(
diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_seq.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_seq.hpp
index ce3501c447..32492482fe 100644
--- a/src/sparse/impl/KokkosSparse_spgemm_impl_seq.hpp
+++ b/src/sparse/impl/KokkosSparse_spgemm_impl_seq.hpp
@@ -95,17 +95,17 @@ void spgemm_debug_symbolic(KernelHandle *handle,
     lno_t row_size              = 0;
 
     for (lno_t j = 0; j < a_row_size; ++j) {
-      size_type ind = a_row_begin + j;
-      lno_t col     = h_enta(ind);
-      // scalar_t val = h_vala(ind);
+      size_type a_ind = a_row_begin + j;
+      lno_t col       = h_enta(a_ind);
+      // scalar_t val = h_vala(a_ind);
 
       const size_type b_row_begin = h_rmb(col);
       const size_type b_row_end   = h_rmb(col + 1);
       lno_t b_row_size            = b_row_end - b_row_begin;
       for (lno_t z = 0; z < b_row_size; ++z) {
-        size_type ind_ = b_row_begin + z;
-        lno_t b_col    = h_entb(ind_);
-        // scalar_t b_val = h_valb(ind_);
+        size_type b_ind = b_row_begin + z;
+        lno_t b_col     = h_entb(b_ind);
+        // scalar_t b_val = h_valb(b_ind);
         // if (i == 0) std::cout << "\tb col:" <<  b_col << std::endl;
         if (acc_flag[b_col] == false) {
           acc_flag[b_col]                  = true;
@@ -194,16 +194,16 @@ void spgemm_debug_numeric(KernelHandle * /* handle */,
     lno_t c_row_size_counter = 0;
 
     for (lno_t j = 0; j < a_row_size; ++j) {
-      size_type ind               = a_row_begin + j;
-      lno_t col                   = h_enta(ind);
-      scalar_t val                = h_vala(ind);
+      size_type a_ind             = a_row_begin + j;
+      lno_t col                   = h_enta(a_ind);
+      scalar_t val                = h_vala(a_ind);
       const size_type b_row_begin = h_rmb(col);
       const size_type b_row_end   = h_rmb(col + 1);
       lno_t b_row_size            = b_row_end - b_row_begin;
       for (lno_t z = 0; z < b_row_size; ++z) {
-        size_type ind_ = b_row_begin + z;
-        lno_t b_col    = h_entb(ind_);
-        scalar_t b_val = h_valb(ind_);
+        size_type b_ind = b_row_begin + z;
+        lno_t b_col     = h_entb(b_ind);
+        scalar_t b_val  = h_valb(b_ind);
 
         if (acc_flag[b_col] == false) {
           acc_flag[b_col]                            = true;
@@ -216,9 +216,9 @@ void spgemm_debug_numeric(KernelHandle * /* handle */,
     // if (i == 0) std::cout << "result_cols" << std::endl;
 
     for (lno_t j = 0; j < c_row_size; ++j) {
-      size_type ind           = c_row_begin + j;
-      lno_t result_col        = h_entc(ind);
-      h_valc(ind)             = accumulator[result_col];
+      size_type c_ind         = c_row_begin + j;
+      lno_t result_col        = h_entc(c_ind);
+      h_valc(c_ind)           = accumulator[result_col];
       accumulator[result_col] = 0;
       acc_flag[result_col]    = false;
     }
diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp
index bc185c0cd1..847d765cb4 100644
--- a/src/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp
+++ b/src/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp
@@ -156,9 +156,9 @@ struct KokkosSPGEMM<HandleType, a_row_view_t_, a_lno_nnz_view_t_,
 
           nnz_lno_t current_col_index = 0;
           const size_type col_begin   = row_mapA[row_index];
-          const nnz_lno_t nnza = nnz_lno_t(row_mapA[row_index + 1] - col_begin);
+          const nnz_lno_t row_size    = row_mapA[row_index + 1] - col_begin;
 
-          for (nnz_lno_t colind = 0; colind < nnza; ++colind) {
+          for (nnz_lno_t colind = 0; colind < row_size; ++colind) {
             size_type a_col = colind + col_begin;
             nnz_lno_t rowB  = entriesA[a_col];
             scalar_t valA   = valuesA[a_col];
diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp
index 2b7c4e3b38..9fc1b8fe72 100644
--- a/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp
+++ b/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp
@@ -1318,7 +1318,7 @@ struct KokkosSPGEMM<HandleType, a_row_view_t_, a_lno_nnz_view_t_,
   }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile size_type &dst, const volatile size_type &src) const {
+  void join(size_type &dst, const size_type &src) const {
     if (dst < src) {
       dst = src;
     }
@@ -1410,7 +1410,7 @@ struct KokkosSPGEMM<HandleType, a_row_view_t_, a_lno_nnz_view_t_,
   }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile size_type &dst, const volatile size_type &src) const {
+  void join(size_type &dst, const size_type &src) const {
     if (dst < src) {
       dst = src;
     }
@@ -2377,7 +2377,7 @@ struct KokkosSPGEMM<HandleType, a_row_view_t_, a_lno_nnz_view_t_,
   }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile size_type &dst, const volatile size_type &src) const {
+  void join(size_type &dst, const size_type &src) const {
     if (dst < src) {
       dst = src;
     }
diff --git a/src/sparse/impl/KokkosSparse_spgemm_mkl2phase_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_mkl2phase_impl.hpp
index 5715c7f098..e6f0c26497 100644
--- a/src/sparse/impl/KokkosSparse_spgemm_mkl2phase_impl.hpp
+++ b/src/sparse/impl/KokkosSparse_spgemm_mkl2phase_impl.hpp
@@ -50,8 +50,6 @@
 #endif
 
 #include "KokkosKernels_Utils.hpp"
-#include <Kokkos_Concepts.hpp>
-#include <vector>
 
 namespace KokkosSparse {
 namespace Impl {
@@ -302,6 +300,11 @@ void mkl2phase_symbolic(
     (void)transposeA;
     (void)transposeB;
     (void)verbose;
+    (void)a_xadj;
+    (void)b_xadj;
+    (void)c_xadj;
+    (void)a_adj;
+    (void)b_adj;
 #endif
 
   } else {
@@ -351,9 +354,7 @@ void mkl2phase_apply(
       typename KernelHandle::HandlePersistentMemorySpace;
   using int_persistent_work_view_t =
       typename Kokkos::View<int *, HandlePersistentMemorySpace>;
-  using MyExecSpace = typename KernelHandle::HandleExecSpace;
-  using value_type  = typename KernelHandle::nnz_scalar_t;
-  using idx         = typename KernelHandle::nnz_lno_t;
+  using idx = typename KernelHandle::nnz_lno_t;
 
   if (std::is_same<idx, int>::value) {
     int *a_xadj = (int *)row_mapA.data();
@@ -639,6 +640,11 @@ void mkl2phase_apply(
     (void)transposeA;
     (void)transposeB;
     (void)verbose;
+    (void)a_xadj;
+    (void)b_xadj;
+    (void)c_xadj;
+    (void)a_adj;
+    (void)b_adj;
 #endif  // __INTEL_MKL__ == 2018 && __INTEL_MKL_UPDATE__ >= 2
   } else {
     (void)m;
diff --git a/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp
index 8eb0bd3930..9a6ab70f9e 100644
--- a/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp
+++ b/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp
@@ -45,637 +45,270 @@
 #ifndef _KOKKOSSPGEMMMKL_HPP
 #define _KOKKOSSPGEMMMKL_HPP
 
+#include "KokkosKernels_config.h"
+#include "KokkosSparse_Utils_mkl.hpp"
+
 #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL
 #include "mkl_spblas.h"
-#include "mkl.h"
-#endif
-
-#include "KokkosKernels_Utils.hpp"
-#include <Kokkos_Concepts.hpp>
 
 namespace KokkosSparse {
-
 namespace Impl {
 
-template <typename KernelHandle, typename in_row_index_view_type,
-          typename in_nonzero_index_view_type, typename bin_row_index_view_type,
-          typename bin_nonzero_index_view_type,
-          typename cin_row_index_view_type>
-void mkl_symbolic(KernelHandle *handle, typename KernelHandle::nnz_lno_t m,
-                  typename KernelHandle::nnz_lno_t n,
-                  typename KernelHandle::nnz_lno_t k,
-                  in_row_index_view_type row_mapA,
-                  in_nonzero_index_view_type entriesA,
-
-                  bool transposeA, bin_row_index_view_type row_mapB,
-                  bin_nonzero_index_view_type entriesB, bool transposeB,
-                  cin_row_index_view_type row_mapC, bool verbose = false) {
-#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL
+// multiplies two sparse MKL matrices and returns sparse MKL matrix
+template <typename value_type>
+inline static MKLSparseMatrix<value_type> mkl_spmm(
+    sparse_operation_t operation, const MKLSparseMatrix<value_type> &A,
+    const MKLSparseMatrix<value_type> &B) {
+  sparse_matrix_t C;
+  KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_spmm(operation, A, B, &C));
+  return MKLSparseMatrix<value_type>(C);
+}
 
-  typedef typename KernelHandle::nnz_lno_t idx;
+template <typename KernelHandle, typename a_rowmap_view_type,
+          typename a_index_view_type, typename a_values_view_type,
+          typename b_rowmap_view_type, typename b_index_view_type,
+          typename b_values_view_type, typename c_rowmap_view_type,
+          typename c_index_view_type, typename c_values_view_type>
+class MKL_SPGEMM {
+ public:
+  typedef typename KernelHandle::nnz_lno_t nnz_lno_t;
   typedef typename KernelHandle::size_type size_type;
-
-  typedef typename KernelHandle::HandleTempMemorySpace HandleTempMemorySpace;
-  typedef typename Kokkos::View<int *, HandleTempMemorySpace>
-      int_temp_work_view_t;
-
   typedef typename KernelHandle::nnz_scalar_t value_type;
-
   typedef typename KernelHandle::HandleExecSpace MyExecSpace;
-  /*
-    if (!(
-        (Kokkos::SpaceAccessibility<typename
-    Kokkos::HostSpace::execution_space, typename
-    device1::memory_space>::accessible) &&
-        (Kokkos::SpaceAccessibility<typename
-    Kokkos::HostSpace::execution_space, typename
-    device2::memory_space>::accessible) &&
-        (Kokkos::SpaceAccessibility<typename
-    Kokkos::HostSpace::execution_space, typename
-    device3::memory_space>::accessible) )
-        ){
-      throw std::runtime_error ("MEMORY IS NOT ALLOCATED IN HOST DEVICE for
-    MKL\n"); return;
-    }
-  */
-  if (std::is_same<idx, int>::value) {
-    int *a_xadj = NULL;
-    int *b_xadj = NULL;
-    int_temp_work_view_t a_xadj_v, b_xadj_v;
-
-    if (std::is_same<size_type, int>::value) {
-      a_xadj = (int *)row_mapA.data();
-      b_xadj = (int *)row_mapB.data();
-    } else {
-      // TODO test this case.
-
-      Kokkos::Timer copy_time;
-      const int max_integer = 2147483647;
-      if (entriesB.extent(0) > max_integer ||
-          entriesA.extent(0) > max_integer) {
-        throw std::runtime_error(
-            "MKL requires integer values for size type for SPGEMM. Copying to "
-            "integer will cause overflow.\n");
-        return;
-      }
-      a_xadj_v = int_temp_work_view_t("tmpa", m + 1);
-      a_xadj   = (int *)a_xadj_v.data();
-      b_xadj_v = int_temp_work_view_t("tmpb", n + 1);
-      b_xadj   = (int *)b_xadj_v.data();
-
-      KokkosKernels::Impl::copy_vector<in_row_index_view_type,
-                                       int_temp_work_view_t, MyExecSpace>(
-          m + 1, row_mapA, a_xadj_v);
-
-      KokkosKernels::Impl::copy_vector<bin_row_index_view_type,
-                                       int_temp_work_view_t, MyExecSpace>(
-          m + 1, row_mapB, b_xadj_v);
-
-      if (verbose)
-        std::cout << "MKL COPY size type to int TIME:" << copy_time.seconds()
-                  << std::endl;
+  typedef typename Kokkos::View<int *, Kokkos::HostSpace> int_tmp_view_t;
+
+ public:
+  static void mkl_symbolic(KernelHandle *handle, nnz_lno_t m, nnz_lno_t n,
+                           nnz_lno_t k, a_rowmap_view_type row_mapA,
+                           a_index_view_type entriesA, bool transposeA,
+                           b_rowmap_view_type row_mapB,
+                           b_index_view_type entriesB, bool transposeB,
+                           c_rowmap_view_type row_mapC, bool verbose = false) {
+    if (m < 1 || n < 1 || k < 1 || entriesA.extent(0) < 1 ||
+        entriesB.extent(0) < 1) {
+      // set correct values in non-empty 0-nnz corner case
+      handle->set_c_nnz(0);
+      Kokkos::deep_copy(row_mapC, 0);
+      return;
     }
 
-    int *a_adj = (int *)entriesA.data();
-    int *b_adj = (int *)entriesB.data();
-
-    std::vector<value_type> tmp_values(
-        KOKKOSKERNELS_MACRO_MAX(entriesB.extent(0), entriesA.extent(0)));
-    value_type *ptmp_values = &(tmp_values[0]);
-    value_type *a_ew        = ptmp_values;
-    value_type *b_ew        = ptmp_values;
-
-    sparse_matrix_t A;
-    sparse_matrix_t B;
-    sparse_matrix_t C;
-
-    if (std::is_same<value_type, float>::value) {
-      if (SPARSE_STATUS_SUCCESS !=
-          mkl_sparse_s_create_csr(&A, SPARSE_INDEX_BASE_ZERO, m, n, a_xadj,
-                                  a_xadj + 1, a_adj, (float *)a_ew)) {
-        throw std::runtime_error(
-            "CANNOT CREATE mkl_sparse_s_create_csr A matrix\n");
-        return;
-      }
-
-      if (SPARSE_STATUS_SUCCESS !=
-          mkl_sparse_s_create_csr(&B, SPARSE_INDEX_BASE_ZERO, n, k, b_xadj,
-                                  b_xadj + 1, b_adj, (float *)b_ew)) {
-        throw std::runtime_error(
-            "CANNOT CREATE mkl_sparse_s_create_csr B matrix\n");
-        return;
-      }
-
-      sparse_operation_t operation;
-      if (transposeA && transposeB) {
-        operation = SPARSE_OPERATION_TRANSPOSE;
-      } else if (!(transposeA || transposeB)) {
-        operation = SPARSE_OPERATION_NON_TRANSPOSE;
-      } else {
-        throw std::runtime_error(
-            "MKL either transpose both matrices, or none for SPGEMM\n");
-        return;
-      }
-
-      Kokkos::Timer timer1;
-      bool success =
-          SPARSE_STATUS_SUCCESS != mkl_sparse_spmm(operation, A, B, &C);
-      if (verbose)
-        std::cout << "Actual FLOAT MKL SPMM Time in symbolic:"
-                  << timer1.seconds() << std::endl;
+    Kokkos::Timer timer;
+    using scalar_t = typename KernelHandle::nnz_scalar_t;
 
-      if (success) {
-        throw std::runtime_error(
-            "ERROR at SPGEMM multiplication in mkl_sparse_spmm\n");
-
-        return;
-      } else {
-        sparse_index_base_t c_indexing;
-        MKL_INT c_rows, c_cols, *rows_start, *rows_end, *columns;
-        float *values;
-
-        if (SPARSE_STATUS_SUCCESS !=
-            mkl_sparse_s_export_csr(C, &c_indexing, &c_rows, &c_cols,
-                                    &rows_start, &rows_end, &columns,
-                                    &values)) {
-          throw std::runtime_error(
-              "ERROR at exporting result matrix in mkl_sparse_spmm\n");
-          return;
-        }
-
-        if (SPARSE_INDEX_BASE_ZERO != c_indexing) {
-          throw std::runtime_error("C is not zero based indexed\n");
-          return;
-        }
-
-        KokkosKernels::Impl::copy_vector<
-            MKL_INT *, typename cin_row_index_view_type::non_const_type,
-            MyExecSpace>(m, rows_start, row_mapC);
-        idx nnz = row_mapC(m) = rows_end[m - 1];
+    const auto export_rowmap = [&](MKL_INT num_rows, MKL_INT *rows_start,
+                                   MKL_INT * /*columns*/,
+                                   scalar_t * /*values*/) {
+      if (handle->mkl_keep_output) {
+        Kokkos::Timer copy_time;
+        const nnz_lno_t nnz = rows_start[num_rows];
         handle->set_c_nnz(nnz);
+        copy(make_host_view(rows_start, num_rows + 1), row_mapC);
+        if (verbose)
+          std::cout << "\tMKL rowmap export time:" << copy_time.seconds()
+                    << std::endl;
       }
+    };
 
-      if (SPARSE_STATUS_SUCCESS != mkl_sparse_destroy(A)) {
-        throw std::runtime_error("Error at mkl_sparse_destroy A\n");
-        return;
-      }
-
-      if (SPARSE_STATUS_SUCCESS != mkl_sparse_destroy(B)) {
-        throw std::runtime_error("Error at mkl_sparse_destroy B\n");
-        return;
-      }
-      if (SPARSE_STATUS_SUCCESS != mkl_sparse_destroy(C)) {
-        throw std::runtime_error("Error at mkl_sparse_destroy C\n");
-        return;
-      }
-    } else if (std::is_same<value_type, double>::value) {
-      /*
-      std::cout << "create a" << std::endl;
-      std::cout << "m:" << m << " n:" << n << std::endl;
-      std::cout << "a_xadj[0]:" << a_xadj[0] << " a_xadj[m]:" << a_xadj[m] <<
-      std::endl; std::cout << "a_adj[a_xadj[m] - 1]:" << a_adj[a_xadj[m] - 1] <<
-      " a_ew[a_xadj[m] - 1]:" << a_ew[a_xadj[m] - 1] << std::endl;
-      */
-      if (SPARSE_STATUS_SUCCESS !=
-          mkl_sparse_d_create_csr(&A, SPARSE_INDEX_BASE_ZERO, m, n, a_xadj,
-                                  a_xadj + 1, a_adj, (double *)a_ew)) {
-        throw std::runtime_error(
-            "CANNOT CREATE mkl_sparse_s_create_csr A matrix\n");
-        return;
-      }
-
-      // std::cout << "create b" << std::endl;
-      if (SPARSE_STATUS_SUCCESS !=
-          mkl_sparse_d_create_csr(&B, SPARSE_INDEX_BASE_ZERO, n, k, b_xadj,
-                                  b_xadj + 1, b_adj, (double *)b_ew)) {
-        throw std::runtime_error(
-            "CANNOT CREATE mkl_sparse_s_create_csr B matrix\n");
-        return;
-      }
+    // use dummy values for A and B inputs
+    a_values_view_type tmp_valsA(
+        Kokkos::ViewAllocateWithoutInitializing("tmp_valuesA"),
+        entriesA.extent(0));
+    b_values_view_type tmp_valsB(
+        Kokkos::ViewAllocateWithoutInitializing("tmp_valuesB"),
+        entriesB.extent(0));
 
-      sparse_operation_t operation;
-      if (transposeA && transposeB) {
-        operation = SPARSE_OPERATION_TRANSPOSE;
-      } else if (!(transposeA || transposeB)) {
-        operation = SPARSE_OPERATION_NON_TRANSPOSE;
-      } else {
-        throw std::runtime_error(
-            "MKL either transpose both matrices, or none for SPGEMM\n");
-        return;
-      }
+    spmm(handle, m, n, k, row_mapA, entriesA, tmp_valsA, transposeA, row_mapB,
+         entriesB, tmp_valsB, transposeB, verbose, export_rowmap);
 
-      Kokkos::Timer timer1;
-      bool success =
-          SPARSE_STATUS_SUCCESS != mkl_sparse_spmm(operation, A, B, &C);
-      if (verbose)
-        std::cout << "Actual DOUBLE MKL SPMM Time Without Free:"
-                  << timer1.seconds() << std::endl;
-      mkl_free_buffers();
-      if (verbose)
-        std::cout << "Actual DOUBLE MKL SPMM Time:" << timer1.seconds()
-                  << std::endl;
-
-      if (success) {
-        throw std::runtime_error(
-            "ERROR at SPGEMM multiplication in mkl_sparse_spmm\n");
-        return;
-      } else {
-        sparse_index_base_t c_indexing;
-        MKL_INT c_rows, c_cols, *rows_start, *rows_end, *columns;
-        double *values;
-
-        if (SPARSE_STATUS_SUCCESS !=
-            mkl_sparse_d_export_csr(C, &c_indexing, &c_rows, &c_cols,
-                                    &rows_start, &rows_end, &columns,
-                                    &values)) {
-          throw std::runtime_error(
-              "ERROR at exporting result matrix in mkl_sparse_spmm\n");
-          return;
-        }
-
-        if (SPARSE_INDEX_BASE_ZERO != c_indexing) {
-          throw std::runtime_error("C is not zero based indexed\n");
-          return;
-        }
-        if (handle->mkl_keep_output) {
-          Kokkos::Timer copy_time;
-
-          KokkosKernels::Impl::copy_vector<
-              MKL_INT *, typename cin_row_index_view_type::non_const_type,
-              MyExecSpace>(m, rows_start, row_mapC);
-          idx nnz = row_mapC(m) = rows_end[m - 1];
-          handle->set_c_nnz(nnz);
-
-          double copy_time_d = copy_time.seconds();
-          if (verbose) std::cout << "MKL COPYTIME:" << copy_time_d << std::endl;
-        }
-      }
+    if (verbose)
+      std::cout << "MKL symbolic time:" << timer.seconds() << std::endl;
+  }
 
-      if (SPARSE_STATUS_SUCCESS != mkl_sparse_destroy(A)) {
-        throw std::runtime_error("Error at mkl_sparse_destroy A\n");
-        return;
-      }
+  static void mkl_numeric(
+      KernelHandle *handle, nnz_lno_t m, nnz_lno_t n, nnz_lno_t k,
+      a_rowmap_view_type row_mapA, a_index_view_type entriesA,
+      a_values_view_type valuesA, bool transposeA, b_rowmap_view_type row_mapB,
+      b_index_view_type entriesB, b_values_view_type valuesB, bool transposeB,
+      c_rowmap_view_type /* row_mapC */, c_index_view_type entriesC,
+      c_values_view_type valuesC, bool verbose = false) {
+    Kokkos::Timer timer;
+
+    const auto export_values =
+        [&](MKL_INT num_rows, MKL_INT *rows_start, MKL_INT *columns,
+            typename KernelHandle::nnz_scalar_t *values) {
+          if (handle->mkl_keep_output) {
+            Kokkos::Timer copy_time;
+            const nnz_lno_t nnz = rows_start[num_rows];
+            copy(make_host_view(columns, nnz), entriesC);
+            copy(make_host_view(values, nnz), valuesC);
+            if (verbose)
+              std::cout << "\tMKL values export time:" << copy_time.seconds()
+                        << std::endl;
+          }
+        };
+
+    spmm(handle, m, n, k, row_mapA, entriesA, valuesA, transposeA, row_mapB,
+         entriesB, valuesB, transposeB, verbose, export_values);
+
+    if (verbose)
+      std::cout << "MKL numeric time:" << timer.seconds() << std::endl;
+  }
 
-      if (SPARSE_STATUS_SUCCESS != mkl_sparse_destroy(B)) {
-        throw std::runtime_error("Error at mkl_sparse_destroy B\n");
-        return;
-      }
-      if (SPARSE_STATUS_SUCCESS != mkl_sparse_destroy(C)) {
-        throw std::runtime_error("Error at mkl_sparse_destroy C\n");
-        return;
-      }
+ private:
+  template <typename CB>
+  static void spmm(KernelHandle * /* handle */, nnz_lno_t m, nnz_lno_t n,
+                   nnz_lno_t k, a_rowmap_view_type row_mapA,
+                   a_index_view_type entriesA, a_values_view_type valuesA,
+
+                   bool transposeA, b_rowmap_view_type row_mapB,
+                   b_index_view_type entriesB, b_values_view_type valuesB,
+                   bool transposeB, bool verbose, const CB &callback) {
+    if (!std::is_same<nnz_lno_t, int>::value) {
+      throw std::runtime_error("MKL requires local ordinals to be integer.\n");
+    }
 
-    } else {
-      throw std::runtime_error(
-          "MKL requires float or double values. Complex values are not "
-          "implemented yet.\n");
+    if (m < 1 || n < 1 || k < 1 || entriesA.extent(0) < 1 ||
+        entriesB.extent(0) < 1) {
       return;
     }
-  } else {
-    throw std::runtime_error("MKL requires local ordinals to be integer.\n");
-    return;
-  }
-#else
-  (void)handle;
-  (void)m;
-  (void)n;
-  (void)k;
-  (void)row_mapA;
-  (void)row_mapB;
-  (void)row_mapC;
-  (void)entriesA;
-  (void)entriesB;
-  (void)transposeA;
-  (void)transposeB;
-  (void)verbose;
-  throw std::runtime_error("MKL IS NOT DEFINED\n");
-  // return;
-#endif
-}
 
-template <
-    typename KernelHandle, typename in_row_index_view_type,
-    typename in_nonzero_index_view_type, typename in_nonzero_value_view_type,
-    typename bin_row_index_view_type, typename bin_nonzero_index_view_type,
-    typename bin_nonzero_value_view_type, typename cin_row_index_view_type,
-    typename cin_nonzero_index_view_type, typename cin_nonzero_value_view_type>
-void mkl_apply(KernelHandle *handle, typename KernelHandle::nnz_lno_t m,
-               typename KernelHandle::nnz_lno_t n,
-               typename KernelHandle::nnz_lno_t k,
-               in_row_index_view_type row_mapA,
-               in_nonzero_index_view_type entriesA,
-               in_nonzero_value_view_type valuesA,
-
-               bool transposeA, bin_row_index_view_type row_mapB,
-               bin_nonzero_index_view_type entriesB,
-               bin_nonzero_value_view_type valuesB, bool transposeB,
-               cin_row_index_view_type row_mapC,
-               cin_nonzero_index_view_type entriesC,
-               cin_nonzero_value_view_type valuesC, bool verbose = false) {
-#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL
+    const auto create_mirror = [](auto view) {
+      return Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), view);
+    };
 
-  typedef typename KernelHandle::nnz_lno_t idx;
-  typedef typename KernelHandle::size_type size_type;
+    auto h_rowsA      = create_mirror(row_mapA);
+    auto h_rowsB      = create_mirror(row_mapB);
+    const int *a_xadj = reinterpret_cast<const int *>(h_rowsA.data());
+    const int *b_xadj = reinterpret_cast<const int *>(h_rowsB.data());
+    int_tmp_view_t a_xadj_v, b_xadj_v;
 
-  typedef typename KernelHandle::HandleTempMemorySpace HandleTempMemorySpace;
-  typedef typename Kokkos::View<int *, HandleTempMemorySpace>
-      int_temp_work_view_t;
-
-  typedef typename KernelHandle::nnz_scalar_t value_type;
-
-  typedef typename KernelHandle::HandleExecSpace MyExecSpace;
-  /*
-      if (!(
-          (Kokkos::SpaceAccessibility<typename
-     Kokkos::HostSpace::execution_space, typename
-     device1::memory_space>::accessible) &&
-          (Kokkos::SpaceAccessibility<typename
-     Kokkos::HostSpace::execution_space, typename
-     device2::memory_space>::accessible) &&
-          (Kokkos::SpaceAccessibility<typename
-     Kokkos::HostSpace::execution_space, typename
-     device3::memory_space>::accessible) )
-          ){
-        throw std::runtime_error ("MEMORY IS NOT ALLOCATED IN HOST DEVICE for
-     MKL\n"); return;
-      }
-  */
-  if (std::is_same<idx, int>::value) {
-    int *a_xadj = NULL;
-    int *b_xadj = NULL;
-    int_temp_work_view_t a_xadj_v, b_xadj_v;
-
-    if (std::is_same<size_type, int>::value) {
-      a_xadj = (int *)row_mapA.data();
-      b_xadj = (int *)row_mapB.data();
-    } else {
-      // TODO test this case.
-
-      Kokkos::Timer copy_time;
-      const int max_integer = 2147483647;
-      if (entriesB.extent(0) > max_integer ||
-          entriesA.extent(0) > max_integer) {
+    if (!std::is_same<size_type, int>::value) {
+      if (entriesA.extent(0) > INT_MAX || entriesB.extent(0) > INT_MAX) {
         throw std::runtime_error(
-            "MKL requires integer values for size type for SPGEMM. Copying to "
+            "MKL requires integer values for size type for SPGEMM. Copying "
+            "to "
             "integer will cause overflow.\n");
-        return;
       }
-      a_xadj_v = int_temp_work_view_t("tmpa", m + 1);
-      a_xadj   = (int *)a_xadj_v.data();
-      b_xadj_v = int_temp_work_view_t("tmpb", n + 1);
-      b_xadj   = (int *)b_xadj_v.data();
-
-      KokkosKernels::Impl::copy_vector<in_row_index_view_type,
-                                       int_temp_work_view_t, MyExecSpace>(
-          m + 1, row_mapA, a_xadj_v);
-
-      KokkosKernels::Impl::copy_vector<bin_row_index_view_type,
-                                       int_temp_work_view_t, MyExecSpace>(
-          m + 1, row_mapB, b_xadj_v);
+      static_assert(
+          std::is_same<typename int_tmp_view_t::value_type,
+                       typename int_tmp_view_t::non_const_value_type>::value,
+          "deep_copy requires non-const destination type");
 
+      Kokkos::Timer copy_time;
+      a_xadj_v = int_tmp_view_t("tmpa", m + 1);
+      b_xadj_v = int_tmp_view_t("tmpb", n + 1);
+      Kokkos::deep_copy(a_xadj_v, h_rowsA);
+      Kokkos::deep_copy(b_xadj_v, h_rowsB);
+      a_xadj = (int *)a_xadj_v.data();
+      b_xadj = (int *)b_xadj_v.data();
       if (verbose)
-        std::cout << "MKL COPY size type to int TIME:" << copy_time.seconds()
-                  << std::endl;
+        std::cout << "\tMKL int-type temp rowmap copy time:"
+                  << copy_time.seconds() << std::endl;
     }
 
-    int *a_adj = (int *)entriesA.data();
-    int *b_adj = (int *)entriesB.data();
-
-    const value_type *a_ew = valuesA.data();
-    const value_type *b_ew = valuesB.data();
-
-    sparse_matrix_t A;
-    sparse_matrix_t B;
-    sparse_matrix_t C;
-
-    if (std::is_same<value_type, float>::value) {
-      if (SPARSE_STATUS_SUCCESS !=
-          mkl_sparse_s_create_csr(&A, SPARSE_INDEX_BASE_ZERO, m, n, a_xadj,
-                                  a_xadj + 1, a_adj, (float *)a_ew)) {
-        throw std::runtime_error(
-            "CANNOT CREATE mkl_sparse_s_create_csr A matrix\n");
-        return;
-      }
-
-      if (SPARSE_STATUS_SUCCESS !=
-          mkl_sparse_s_create_csr(&B, SPARSE_INDEX_BASE_ZERO, n, k, b_xadj,
-                                  b_xadj + 1, b_adj, (float *)b_ew)) {
-        throw std::runtime_error(
-            "CANNOT CREATE mkl_sparse_s_create_csr B matrix\n");
-        return;
-      }
-
-      sparse_operation_t operation;
-      if (transposeA && transposeB) {
-        operation = SPARSE_OPERATION_TRANSPOSE;
-      } else if (!(transposeA || transposeB)) {
-        operation = SPARSE_OPERATION_NON_TRANSPOSE;
-      } else {
-        throw std::runtime_error(
-            "MKL either transpose both matrices, or none for SPGEMM\n");
-        return;
-      }
-
-      Kokkos::Timer timer1;
-      bool success =
-          SPARSE_STATUS_SUCCESS != mkl_sparse_spmm(operation, A, B, &C);
-      if (verbose)
-        std::cout << "Actual FLOAT MKL SPMM Time:" << timer1.seconds()
-                  << std::endl;
-
-      if (success) {
-        throw std::runtime_error(
-            "ERROR at SPGEMM multiplication in mkl_sparse_spmm\n");
-
-        return;
-      } else {
-        sparse_index_base_t c_indexing;
-        MKL_INT c_rows, c_cols, *rows_start, *rows_end, *columns;
-        float *values;
-
-        if (SPARSE_STATUS_SUCCESS !=
-            mkl_sparse_s_export_csr(C, &c_indexing, &c_rows, &c_cols,
-                                    &rows_start, &rows_end, &columns,
-                                    &values)) {
-          throw std::runtime_error(
-              "ERROR at exporting result matrix in mkl_sparse_spmm\n");
-          return;
-        }
-
-        if (SPARSE_INDEX_BASE_ZERO != c_indexing) {
-          throw std::runtime_error("C is not zero based indexed\n");
-          return;
-        }
-
-        // KokkosKernels::Impl::copy_vector<MKL_INT *, typename
-        // cin_row_index_view_type::non_const_type, MyExecSpace> (m, rows_start,
-        // row_mapC); idx nnz = row_mapC(m) = rows_end[m - 1];
-        idx nnz = rows_end[m - 1];
-        using non_const_size_type =
-            typename cin_row_index_view_type::non_const_value_type;
-        auto *tmpPtr = const_cast<non_const_size_type *>(row_mapC.data());
-        tmpPtr[m]    = nnz;
-
-        KokkosKernels::Impl::copy_vector<
-            MKL_INT *, typename cin_nonzero_index_view_type::non_const_type,
-            MyExecSpace>(nnz, columns, entriesC);
-        KokkosKernels::Impl::copy_vector<
-            float *, typename cin_nonzero_value_view_type::non_const_type,
-            MyExecSpace>(nnz, values, valuesC);
-      }
-
-      if (SPARSE_STATUS_SUCCESS != mkl_sparse_destroy(A)) {
-        throw std::runtime_error("Error at mkl_sparse_destroy A\n");
-        return;
-      }
-
-      if (SPARSE_STATUS_SUCCESS != mkl_sparse_destroy(B)) {
-        throw std::runtime_error("Error at mkl_sparse_destroy B\n");
-        return;
-      }
-      if (SPARSE_STATUS_SUCCESS != mkl_sparse_destroy(C)) {
-        throw std::runtime_error("Error at mkl_sparse_destroy C\n");
-        return;
-      }
-    } else if (std::is_same<value_type, double>::value) {
-      /*
-      std::cout << "create a" << std::endl;
-      std::cout << "m:" << m << " n:" << n << std::endl;
-      std::cout << "a_xadj[0]:" << a_xadj[0] << " a_xadj[m]:" << a_xadj[m] <<
-      std::endl; std::cout << "a_adj[a_xadj[m] - 1]:" << a_adj[a_xadj[m] - 1] <<
-      " a_ew[a_xadj[m] - 1]:" << a_ew[a_xadj[m] - 1] << std::endl;
-      */
-      if (SPARSE_STATUS_SUCCESS !=
-          mkl_sparse_d_create_csr(&A, SPARSE_INDEX_BASE_ZERO, m, n, a_xadj,
-                                  a_xadj + 1, a_adj, (double *)a_ew)) {
-        throw std::runtime_error(
-            "CANNOT CREATE mkl_sparse_s_create_csr A matrix\n");
-        return;
-      }
-
-      // std::cout << "create b" << std::endl;
-      if (SPARSE_STATUS_SUCCESS !=
-          mkl_sparse_d_create_csr(&B, SPARSE_INDEX_BASE_ZERO, n, k, b_xadj,
-                                  b_xadj + 1, b_adj, (double *)b_ew)) {
-        throw std::runtime_error(
-            "CANNOT CREATE mkl_sparse_s_create_csr B matrix\n");
-        return;
-      }
-
-      sparse_operation_t operation;
-      if (transposeA && transposeB) {
-        operation = SPARSE_OPERATION_TRANSPOSE;
-      } else if (!(transposeA || transposeB)) {
-        operation = SPARSE_OPERATION_NON_TRANSPOSE;
-      } else {
-        throw std::runtime_error(
-            "MKL either transpose both matrices, or none for SPGEMM\n");
-        return;
-      }
-
-      Kokkos::Timer timer1;
-      bool success =
-          SPARSE_STATUS_SUCCESS != mkl_sparse_spmm(operation, A, B, &C);
-      if (verbose)
-        std::cout << "Actual DOUBLE MKL SPMM Time Without Free:"
-                  << timer1.seconds() << std::endl;
+    auto h_valsA           = create_mirror(valuesA);
+    auto h_valsB           = create_mirror(valuesB);
+    auto h_entriesA        = create_mirror(entriesA);
+    auto h_entriesB        = create_mirror(entriesB);
+    const int *a_adj       = reinterpret_cast<const int *>(h_entriesA.data());
+    const int *b_adj       = reinterpret_cast<const int *>(h_entriesB.data());
+    const value_type *a_ew = h_valsA.data();
+    const value_type *b_ew = h_valsB.data();
+
+    // Hack: we discard const with pointer casts here to work around MKL
+    // requiring mutable input and our symbolic interface not providing it
+    using Matrix = MKLSparseMatrix<value_type>;
+    Matrix A(m, n, (int *)a_xadj, (int *)a_adj, (value_type *)a_ew);
+    Matrix B(n, k, (int *)b_xadj, (int *)b_adj, (value_type *)b_ew);
+
+    sparse_operation_t operation;
+    if (transposeA && transposeB) {
+      operation = SPARSE_OPERATION_TRANSPOSE;
+    } else if (!(transposeA || transposeB)) {
+      operation = SPARSE_OPERATION_NON_TRANSPOSE;
+    } else {
+      throw std::runtime_error(
+          "MKL either transpose both matrices, or none for SPGEMM\n");
+    }
 
-      mkl_free_buffers();
-      if (verbose)
-        std::cout << "Actual DOUBLE MKL SPMM Time:" << timer1.seconds()
-                  << std::endl;
+    Kokkos::Timer timer1;
+    Matrix C = mkl_spmm(operation, A, B);
+    if (verbose) {
+      std::cout << "\tMKL spmm (";
+      if (std::is_same<float, value_type>::value)
+        std::cout << "FLOAT";
+      else if (std::is_same<double, value_type>::value)
+        std::cout << "DOUBLE";
+      else
+        std::cout << "?";
+      std::cout << ") time:" << timer1.seconds() << std::endl;
+    }
 
-      if (success) {
-        throw std::runtime_error(
-            "ERROR at SPGEMM multiplication in mkl_sparse_spmm\n");
-        return;
-      } else {
-        sparse_index_base_t c_indexing;
-        MKL_INT c_rows, c_cols, *rows_start, *rows_end, *columns;
-        double *values;
-
-        if (SPARSE_STATUS_SUCCESS !=
-            mkl_sparse_d_export_csr(C, &c_indexing, &c_rows, &c_cols,
-                                    &rows_start, &rows_end, &columns,
-                                    &values)) {
-          throw std::runtime_error(
-              "ERROR at exporting result matrix in mkl_sparse_spmm\n");
-          return;
-        }
-
-        if (SPARSE_INDEX_BASE_ZERO != c_indexing) {
-          throw std::runtime_error("C is not zero based indexed\n");
-          return;
-        }
-        if (handle->mkl_keep_output) {
-          Kokkos::Timer copy_time;
-
-          // KokkosKernels::Impl::copy_vector<MKL_INT *, typename
-          // cin_row_index_view_type::non_const_type, MyExecSpace> (m,
-          // rows_start, row_mapC); idx nnz = row_mapC(m) = rows_end[m - 1];
-          idx nnz = rows_end[m - 1];
-          using non_const_size_type =
-              typename cin_row_index_view_type::non_const_value_type;
-          auto *tmpPtr = const_cast<non_const_size_type *>(row_mapC.data());
-          tmpPtr[m]    = nnz;
-
-          KokkosKernels::Impl::copy_vector<
-              MKL_INT *, typename cin_nonzero_index_view_type::non_const_type,
-              MyExecSpace>(nnz, columns, entriesC);
-          KokkosKernels::Impl::copy_vector<
-              double *, typename cin_nonzero_value_view_type::non_const_type,
-              MyExecSpace>(nnz, values, valuesC);
-          double copy_time_d = copy_time.seconds();
-          if (verbose) std::cout << "MKL COPYTIME:" << copy_time_d << std::endl;
-        }
-      }
+    MKL_INT num_rows, num_cols, *rows_start, *columns;
+    value_type *values;
+    C.export_data(num_rows, num_cols, rows_start, columns, values);
+    callback(m, rows_start, columns, values);
 
-      if (SPARSE_STATUS_SUCCESS != mkl_sparse_destroy(A)) {
-        throw std::runtime_error("Error at mkl_sparse_destroy A\n");
-        return;
-      }
+    A.destroy();
+    B.destroy();
+    C.destroy();
+  }
 
-      if (SPARSE_STATUS_SUCCESS != mkl_sparse_destroy(B)) {
-        throw std::runtime_error("Error at mkl_sparse_destroy B\n");
-        return;
-      }
-      if (SPARSE_STATUS_SUCCESS != mkl_sparse_destroy(C)) {
-        throw std::runtime_error("Error at mkl_sparse_destroy C\n");
-        return;
-      }
+  template <typename from_view_type, typename dst_view_type>
+  inline static void copy(from_view_type from, dst_view_type to) {
+    auto h_from =
+        Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), from);
+    auto h_to = Kokkos::create_mirror_view(Kokkos::HostSpace(), to);
+    Kokkos::deep_copy(h_to, h_from);  // view copy (for different element types)
+    Kokkos::deep_copy(to, h_to);
+  }
 
-    } else {
-      throw std::runtime_error(
-          "MKL requires float or double values. Complex values are not "
-          "implemented yet.\n");
-      return;
-    }
-  } else {
-    throw std::runtime_error("MKL requires local ordinals to be integer.\n");
-    return;
+  template <typename T,
+            typename view_type = Kokkos::View<const T *, Kokkos::HostSpace>>
+  inline static view_type make_host_view(const T *data, size_t num_elems) {
+    return view_type(data, num_elems);
   }
-#else
-  (void)handle;
-  (void)m;
-  (void)n;
-  (void)k;
-  (void)row_mapA;
-  (void)row_mapB;
-  (void)row_mapC;
-  (void)entriesA;
-  (void)entriesB;
-  (void)entriesC;
-  (void)valuesA;
-  (void)valuesB;
-  (void)valuesC;
-  (void)transposeA;
-  (void)transposeB;
-  (void)verbose;
-  throw std::runtime_error("MKL IS NOT DEFINED\n");
-  // return;
-#endif
+};
+
+template <typename KernelHandle, typename a_rowmap_type, typename a_index_type,
+          typename b_rowmap_type, typename b_index_type, typename c_rowmap_type,
+          typename nnz_lno_t = typename KernelHandle::nnz_lno_t>
+void mkl_symbolic(KernelHandle *handle, nnz_lno_t m, nnz_lno_t n, nnz_lno_t k,
+                  a_rowmap_type row_mapA, a_index_type entriesA,
+                  bool transposeA, b_rowmap_type row_mapB,
+                  b_index_type entriesB, bool transposeB,
+                  c_rowmap_type row_mapC, bool verbose = false) {
+  using values_type  = typename KernelHandle::scalar_temp_work_view_t;
+  using c_index_type = b_index_type;
+  using mkl = MKL_SPGEMM<KernelHandle, a_rowmap_type, a_index_type, values_type,
+                         b_rowmap_type, b_index_type, values_type,
+                         c_rowmap_type, c_index_type, values_type>;
+  mkl::mkl_symbolic(handle, m, n, k, row_mapA, entriesA, transposeA, row_mapB,
+                    entriesB, transposeB, row_mapC, verbose);
 }
+
+template <typename KernelHandle, typename a_rowmap_type, typename a_index_type,
+          typename a_values_type, typename b_rowmap_type, typename b_index_type,
+          typename b_values_type, typename c_rowmap_type, typename c_index_type,
+          typename c_values_type,
+          typename nnz_lno_t = typename KernelHandle::nnz_lno_t>
+void mkl_numeric(KernelHandle *handle, nnz_lno_t m, nnz_lno_t n, nnz_lno_t k,
+                 a_rowmap_type row_mapA, a_index_type entriesA,
+                 a_values_type valuesA, bool transposeA, b_rowmap_type row_mapB,
+                 b_index_type entriesB, b_values_type valuesB, bool transposeB,
+                 c_rowmap_type row_mapC, c_index_type entriesC,
+                 c_values_type valuesC, bool verbose = false) {
+  using mkl =
+      MKL_SPGEMM<KernelHandle, a_rowmap_type, a_index_type, a_values_type,
+                 b_rowmap_type, b_index_type, b_values_type, c_rowmap_type,
+                 c_index_type, c_values_type>;
+  mkl::mkl_numeric(handle, m, n, k, row_mapA, entriesA, valuesA, transposeA,
+                   row_mapB, entriesB, valuesB, transposeB, row_mapC, entriesC,
+                   valuesC, verbose);
+}
+
 }  // namespace Impl
 }  // namespace KokkosSparse
 
-#endif
+#endif  // KOKKOSKERNELS_ENABLE_TPL_MKL
+#endif  // _KOKKOSSPGEMMMKL_HPP
diff --git a/src/sparse/impl/KokkosSparse_spgemm_numeric_spec.hpp b/src/sparse/impl/KokkosSparse_spgemm_numeric_spec.hpp
index beb969fc77..24008d3b26 100644
--- a/src/sparse/impl/KokkosSparse_spgemm_numeric_spec.hpp
+++ b/src/sparse/impl/KokkosSparse_spgemm_numeric_spec.hpp
@@ -211,23 +211,6 @@ struct SPGEMM_NUMERIC<
     if (!sh->is_symbolic_called()) {
       throw std::runtime_error(
           "Call spgemm symbolic before calling SpGEMM numeric");
-      /*
-      KokkosSparse::Experimental::spgemm_symbolic<KernelHandle,
-                    a_size_view_t_, a_lno_view_t,
-                    b_size_view_t_, b_lno_view_t,
-                    c_size_view_t_>(
-          handle, m, n, k,
-          row_mapA, entriesA, transposeA,
-          row_mapB, entriesB, transposeB,
-          row_mapC
-          );
-      typename c_size_view_t_::value_type c_nnz_size =
-      handle->get_spgemm_handle()->get_c_nnz(); if (c_nnz_size){ entriesC =
-      c_lno_view_t (Kokkos::view_alloc(Kokkos::WithoutInitializing, "entriesC"),
-      c_nnz_size); valuesC = c_scalar_view_t
-      (Kokkos::view_alloc(Kokkos::WithoutInitializing, "valuesC"), c_nnz_size);
-      }
-      */
     }
 
     switch (sh->get_algorithm_type()) {
@@ -245,9 +228,13 @@ struct SPGEMM_NUMERIC<
                                     transposeB, row_mapC, entriesC, valuesC);
         break;
       case SPGEMM_MKL:
-        mkl_apply(sh, m, n, k, row_mapA, entriesA, valuesA, transposeA,
-                  row_mapB, entriesB, valuesB, transposeB, row_mapC, entriesC,
-                  valuesC, handle->get_verbose());
+#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL
+        mkl_numeric(sh, m, n, k, row_mapA, entriesA, valuesA, transposeA,
+                    row_mapB, entriesB, valuesB, transposeB, row_mapC, entriesC,
+                    valuesC, handle->get_verbose());
+#else
+        throw std::runtime_error("MKL was not enabled in this build!");
+#endif
         break;
       case SPGEMM_MKL2PHASE:
         mkl2phase_apply(sh, m, n, k, row_mapA, entriesA, valuesA, transposeA,
diff --git a/src/sparse/impl/KokkosSparse_spgemm_symbolic_spec.hpp b/src/sparse/impl/KokkosSparse_spgemm_symbolic_spec.hpp
index 181984ebe9..d83ae6767c 100644
--- a/src/sparse/impl/KokkosSparse_spgemm_symbolic_spec.hpp
+++ b/src/sparse/impl/KokkosSparse_spgemm_symbolic_spec.hpp
@@ -179,9 +179,13 @@ struct SPGEMM_SYMBOLIC<KernelHandle, a_size_view_t_, a_lno_view_t,
                               row_mapC);
         break;
       case SPGEMM_MKL:
+#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL
         mkl_symbolic(sh, m, n, k, row_mapA, entriesA, transposeA, row_mapB,
                      entriesB, transposeB, row_mapC, handle->get_verbose());
         break;
+#else
+        throw std::runtime_error("MKL was not enabled in this build!");
+#endif
     }
     sh->set_call_symbolic();
   }
diff --git a/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp
index d0b80ace69..4af8606dfb 100644
--- a/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp
+++ b/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp
@@ -242,52 +242,54 @@ struct ILUKLvlSchedTP1NumericFunctor {
 
   KOKKOS_INLINE_FUNCTION
   void operator()(const member_type &team) const {
-    auto my_league = team.league_rank();  // map to rowid
-    auto rowid     = level_idx(my_league + lev_start);
-    auto my_team   = team.team_rank();
+    nnz_lno_t my_team = static_cast<nnz_lno_t>(team.league_rank());
+    nnz_lno_t rowid =
+        static_cast<nnz_lno_t>(level_idx(my_team + lev_start));  // map to rowid
 
-    auto k1 = L_row_map(rowid);
-    auto k2 = L_row_map(rowid + 1);
+    size_type k1 = static_cast<size_type>(L_row_map(rowid));
+    size_type k2 = static_cast<size_type>(L_row_map(rowid + 1));
 #ifdef KEEP_DIAG
     Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2 - 1),
                          [&](const size_type k) {
-                           auto col           = L_entries(k);
-                           L_values(k)        = 0.0;
-                           iw(my_league, col) = k;
+                           nnz_lno_t col = static_cast<nnz_lno_t>(L_entries(k));
+                           L_values(k)   = 0.0;
+                           iw(my_team, col) = static_cast<nnz_lno_t>(k);
                          });
 #else
     Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2),
                          [&](const size_type k) {
-                           auto col           = L_entries(k);
-                           L_values(k)        = 0.0;
-                           iw(my_league, col) = k;
+                           nnz_lno_t col = static_cast<nnz_lno_t>(L_entries(k));
+                           L_values(k)   = 0.0;
+                           iw(my_team, col) = static_cast<nnz_lno_t>(k);
                          });
 #endif
 
 #ifdef KEEP_DIAG
-    if (my_team == 0) L_values(k2 - 1) = scalar_t(1.0);
+    // if (my_thread == 0) L_values(k2 - 1) = scalar_t(1.0);
+    Kokkos::single(Kokkos::PerTeam(team),
+                   [&]() { L_values(k2 - 1) = scalar_t(1.0); });
 #endif
 
     team.team_barrier();
 
-    k1 = U_row_map(rowid);
-    k2 = U_row_map(rowid + 1);
+    k1 = static_cast<size_type>(U_row_map(rowid));
+    k2 = static_cast<size_type>(U_row_map(rowid + 1));
     Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2),
                          [&](const size_type k) {
-                           auto col           = U_entries(k);
-                           U_values(k)        = 0.0;
-                           iw(my_league, col) = k;
+                           nnz_lno_t col = static_cast<nnz_lno_t>(U_entries(k));
+                           U_values(k)   = 0.0;
+                           iw(my_team, col) = static_cast<nnz_lno_t>(k);
                          });
 
     team.team_barrier();
 
     // Unpack the ith row of A
-    k1 = A_row_map(rowid);
-    k2 = A_row_map(rowid + 1);
+    k1 = static_cast<size_type>(A_row_map(rowid));
+    k2 = static_cast<size_type>(A_row_map(rowid + 1));
     Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2),
                          [&](const size_type k) {
-                           auto col  = A_entries(k);
-                           auto ipos = iw(my_league, col);
+                           nnz_lno_t col = static_cast<nnz_lno_t>(A_entries(k));
+                           nnz_lno_t ipos = iw(my_team, col);
                            if (col < rowid)
                              L_values(ipos) = A_values(k);
                            else
@@ -297,20 +299,22 @@ struct ILUKLvlSchedTP1NumericFunctor {
     team.team_barrier();
 
     // Eliminate prev rows
-    k1 = L_row_map(rowid);
-    k2 = L_row_map(rowid + 1);
+    k1 = static_cast<size_type>(L_row_map(rowid));
+    k2 = static_cast<size_type>(L_row_map(rowid + 1));
 #ifdef KEEP_DIAG
-    for (auto k = k1; k < k2 - 1; ++k) {
+    for (size_type k = k1; k < k2 - 1; k++)
 #else
-    for (auto k = k1; k < k2; ++k) {
+    for (size_type k = k1; k < k2; k++)
 #endif
-      auto prev_row = L_entries(k);
+    {
+      nnz_lno_t prev_row = L_entries(k);
 #ifdef KEEP_DIAG
-      auto fact = L_values(k) / U_values(U_row_map(prev_row));
+      scalar_t fact = L_values(k) / U_values(U_row_map(prev_row));
 #else
-      auto fact = L_values(k) * U_values(U_row_map(prev_row));
+      scalar_t fact = L_values(k) * U_values(U_row_map(prev_row));
 #endif
-      if (my_team == 0) L_values(k) = fact;
+      // if (my_thread == 0) L_values(k) = fact;
+      Kokkos::single(Kokkos::PerTeam(team), [&]() { L_values(k) = fact; });
 
       team.team_barrier();
 
@@ -318,10 +322,10 @@ struct ILUKLvlSchedTP1NumericFunctor {
           Kokkos::TeamThreadRange(team, U_row_map(prev_row) + 1,
                                   U_row_map(prev_row + 1)),
           [&](const size_type kk) {
-            auto col  = U_entries(kk);
-            auto ipos = iw(my_league, col);
+            nnz_lno_t col  = static_cast<nnz_lno_t>(U_entries(kk));
+            nnz_lno_t ipos = iw(my_team, col);
+            auto lxu       = -U_values(kk) * fact;
             if (ipos != -1) {
-              auto lxu = -U_values(kk) * fact;
               if (col < rowid)
                 Kokkos::atomic_add(&L_values(ipos), lxu);
               else
@@ -332,40 +336,49 @@ struct ILUKLvlSchedTP1NumericFunctor {
       team.team_barrier();
     }  // end for k
 
-    if (my_team == 0) {
+    // if (my_thread == 0) {
+    Kokkos::single(Kokkos::PerTeam(team), [&]() {
+      nnz_lno_t ipos = iw(my_team, rowid);
 #ifdef KEEP_DIAG
-      if (U_values(iw(my_league, rowid)) == 0.0) {
-        U_values(iw(my_league, rowid)) = 1e6;
+      if (U_values(ipos) == 0.0) {
+        U_values(ipos) = 1e6;
       }
 #else
-      if (U_values(iw(my_league, rowid)) == 0.0) {
-        U_values(iw(my_league, rowid)) = 1e6;
+      if (U_values(ipos) == 0.0) {
+        U_values(ipos) = 1e6;
       } else {
-        U_values(iw(my_league, rowid)) = 1.0 / U_values(iw(my_league, rowid));
+        U_values(ipos) = 1.0 / U_values(ipos);
       }
 #endif
-    }
+    });
+    //}
 
     team.team_barrier();
 
     // Reset
-    k1 = L_row_map(rowid);
-    k2 = L_row_map(rowid + 1);
+    k1 = static_cast<size_type>(L_row_map(rowid));
+    k2 = static_cast<size_type>(L_row_map(rowid + 1));
 #ifdef KEEP_DIAG
-    Kokkos::parallel_for(
-        Kokkos::TeamThreadRange(team, k1, k2 - 1),
-        [&](const size_type k) { iw(my_league, L_entries(k)) = -1; });
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2 - 1),
+                         [&](const size_type k) {
+                           nnz_lno_t col = static_cast<nnz_lno_t>(L_entries(k));
+                           iw(my_team, col) = -1;
+                         });
 #else
-    Kokkos::parallel_for(
-        Kokkos::TeamThreadRange(team, k1, k2),
-        [&](const size_type k) { iw(my_league, L_entries(k)) = -1; });
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2),
+                         [&](const size_type k) {
+                           nnz_lno_t col = static_cast<nnz_lno_t>(L_entries(k));
+                           iw(my_team, col) = -1;
+                         });
 #endif
 
-    k1 = U_row_map(rowid);
-    k2 = U_row_map(rowid + 1);
-    Kokkos::parallel_for(
-        Kokkos::TeamThreadRange(team, k1, k2),
-        [&](const size_type k) { iw(my_league, U_entries(k)) = -1; });
+    k1 = static_cast<size_type>(U_row_map(rowid));
+    k2 = static_cast<size_type>(U_row_map(rowid + 1));
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2),
+                         [&](const size_type k) {
+                           nnz_lno_t col = static_cast<nnz_lno_t>(U_entries(k));
+                           iw(my_team, col) = -1;
+                         });
   }
 };
 
@@ -379,23 +392,17 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map,
                   LValuesType &L_values, const URowMapType &U_row_map,
                   const UEntriesType &U_entries, UValuesType &U_values) {
   using execution_space         = typename IlukHandle::execution_space;
-  using memory_space            = typename IlukHandle::memory_space;
   using size_type               = typename IlukHandle::size_type;
   using nnz_lno_t               = typename IlukHandle::nnz_lno_t;
   using HandleDeviceEntriesType = typename IlukHandle::nnz_lno_view_t;
-  using WorkViewType =
-      Kokkos::View<nnz_lno_t **, Kokkos::Device<execution_space, memory_space>>;
-  using LevelHostViewType = Kokkos::View<nnz_lno_t *, Kokkos::HostSpace>;
+  using WorkViewType            = typename IlukHandle::work_view_t;
+  using LevelHostViewType       = typename IlukHandle::nnz_lno_view_host_t;
 
   size_type nlevels = thandle.get_num_levels();
-  size_type nrows   = thandle.get_nrows();
 
   // Keep these as host View, create device version and copy back to host
-  HandleDeviceEntriesType level_ptr     = thandle.get_level_ptr();
-  HandleDeviceEntriesType level_idx     = thandle.get_level_idx();
-  HandleDeviceEntriesType level_nchunks = thandle.get_level_nchunks();
-  HandleDeviceEntriesType level_nrowsperchunk =
-      thandle.get_level_nrowsperchunk();
+  HandleDeviceEntriesType level_ptr = thandle.get_level_ptr();
+  HandleDeviceEntriesType level_idx = thandle.get_level_idx();
 
   // Make level_ptr_h a separate allocation, since it will be accessed on host
   // between kernel launches. If a mirror were used and level_ptr is in UVM
@@ -409,25 +416,13 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map,
       level_ptr.extent(0));
   Kokkos::deep_copy(level_ptr_h, level_ptr);
 
+  //{
   if (thandle.get_algorithm() ==
       KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) {
-    level_nchunks_h = LevelHostViewType(
-        Kokkos::view_alloc(Kokkos::WithoutInitializing, "Host level nchunks"),
-        level_nchunks.extent(0));
-    level_nrowsperchunk_h =
-        LevelHostViewType(Kokkos::view_alloc(Kokkos::WithoutInitializing,
-                                             "Host level nrowsperchunk"),
-                          level_nrowsperchunk.extent(0));
-    Kokkos::deep_copy(level_nchunks_h, level_nchunks);
-    Kokkos::deep_copy(level_nrowsperchunk_h, level_nrowsperchunk);
-    iw = WorkViewType(Kokkos::view_alloc(Kokkos::WithoutInitializing, "iw"),
-                      thandle.get_level_maxrowsperchunk(), nrows);
-    Kokkos::deep_copy(iw, nnz_lno_t(-1));
-  } else {
-    iw = WorkViewType(Kokkos::view_alloc(Kokkos::WithoutInitializing, "iw"),
-                      thandle.get_level_maxrows(), nrows);
-    Kokkos::deep_copy(iw, nnz_lno_t(-1));
+    level_nchunks_h       = thandle.get_level_nchunks();
+    level_nrowsperchunk_h = thandle.get_level_nrowsperchunk();
   }
+  iw = thandle.get_iw();
 
   // Main loop must be performed sequential. Question: Try out Cuda's graph
   // stuff to reduce kernel launch overhead
@@ -476,49 +471,13 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map,
           else
             Kokkos::parallel_for("parfor_l_team",
                                  policy_type(lvl_nrows_chunk, team_size), tstf);
-
+          Kokkos::fence();
           lvl_rowid_start += lvl_nrows_chunk;
         }
       }
-      //      /*
-      //      // TP2 algorithm has issues with some offset-ordinal combo to be
-      //      addressed else if ( thandle.get_algorithm() ==
-      //      KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHED_TP2 ) {
-      //        typedef Kokkos::TeamPolicy<execution_space> tvt_policy_type;
-      //
-      //        int team_size = thandle.get_team_size();
-      //        if ( team_size == -1 ) {
-      //          team_size = std::is_same< typename
-      //          Kokkos::DefaultExecutionSpace::memory_space, Kokkos::HostSpace
-      //          >::value ? 1 : 128;
-      //        }
-      //        int vector_size = thandle.get_team_size();
-      //        if ( vector_size == -1 ) {
-      //          vector_size = std::is_same< typename
-      //          Kokkos::DefaultExecutionSpace::memory_space, Kokkos::HostSpace
-      //          >::value ? 1 : 4;
-      //        }
-      //
-      //        // This impl: "chunk" lvl_nodes into node_groups; a league_rank
-      //        is responsible for processing that many nodes
-      //        //       TeamThreadRange over number of node_groups
-      //        //       To avoid masking threads, 1 thread (team) per node in
-      //        node_group
-      //        //       ThreadVectorRange responsible for the actual solve
-      //        computation const int node_groups = team_size;
-      //
-      //        LowerTriLvlSchedTP2SolverFunctor<RowMapType, EntriesType,
-      //        ValuesType, LHSType, RHSType, HandleDeviceEntriesType>
-      //        tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level,
-      //        row_count, node_groups);
-      //        Kokkos::parallel_for("parfor_u_team_vector", tvt_policy_type(
-      //        (int)std::ceil((float)lvl_nodes/(float)node_groups) , team_size,
-      //        vector_size ), tstf);
-      //      } // end elseif
-      //      */
-
     }  // end if
   }    // end for lvl
+  //}
 
 // Output check
 #ifdef NUMERIC_OUTPUT_INFO
@@ -526,7 +485,7 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map,
 
   std::cout << "  nnzL: " << thandle.get_nnzL() << std::endl;
   std::cout << "  L_row_map = ";
-  for (size_type i = 0; i < nrows + 1; ++i) {
+  for (size_type i = 0; i < thandle.get_nrows() + 1; ++i) {
     std::cout << L_row_map(i) << " ";
   }
   std::cout << std::endl;
@@ -545,7 +504,7 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map,
 
   std::cout << "  nnzU: " << thandle.get_nnzU() << std::endl;
   std::cout << "  U_row_map = ";
-  for (size_type i = 0; i < nrows + 1; ++i) {
+  for (size_type i = 0; i < thandle.get_nrows() + 1; ++i) {
     std::cout << U_row_map(i) << " ";
   }
   std::cout << std::endl;
diff --git a/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp b/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp
index 90bb88e057..691d624963 100644
--- a/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp
+++ b/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp
@@ -121,15 +121,15 @@ void level_sched(IlukHandle& thandle, const RowMapType row_map,
 
 // SEQLVLSCHD_TP1 algorithm (chunks)
 template <class IlukHandle, class RowMapType, class EntriesType,
-          class LevelType1, class LevelType2, class LevelType3, class size_type>
-void level_sched(IlukHandle& thandle, const RowMapType row_map,
-                 const EntriesType entries, LevelType1& level_list,
-                 LevelType2& level_ptr, LevelType2& level_idx,
-                 LevelType3& level_nchunks, LevelType3& level_nrowsperchunk,
-                 size_type& nlevels) {
+          class LevelType1, class LevelType2, class size_type>
+void level_sched_tp(IlukHandle& thandle, const RowMapType row_map,
+                    const EntriesType entries, LevelType1& level_list,
+                    LevelType2& level_ptr, LevelType2& level_idx,
+                    size_type& nlevels) {
   // Scheduling currently compute on host
 
-  using nnz_lno_t = typename IlukHandle::nnz_lno_t;
+  using nnz_lno_t           = typename IlukHandle::nnz_lno_t;
+  using nnz_lno_view_host_t = typename IlukHandle::nnz_lno_view_host_t;
 
   size_type nrows = thandle.get_nrows();
 
@@ -168,11 +168,10 @@ void level_sched(IlukHandle& thandle, const RowMapType row_map,
   level_ptr(0) = 0;
 
   // Find max rows, number of chunks, max rows of chunks across levels
-  using HostViewType =
-      Kokkos::View<nnz_lno_t*, Kokkos::LayoutLeft, Kokkos::HostSpace>;
-
-  HostViewType lnchunks("lnchunks", nlevels);
-  HostViewType lnrowsperchunk("lnrowsperchunk", nlevels);
+  thandle.alloc_level_nchunks(nlevels);
+  thandle.alloc_level_nrowsperchunk(nlevels);
+  nnz_lno_view_host_t lnchunks       = thandle.get_level_nchunks();
+  nnz_lno_view_host_t lnrowsperchunk = thandle.get_level_nrowsperchunk();
 
 #ifdef KOKKOS_ENABLE_CUDA
   using memory_space = typename IlukHandle::memory_space;
@@ -214,9 +213,6 @@ void level_sched(IlukHandle& thandle, const RowMapType row_map,
   thandle.set_num_levels(nlevels);
   thandle.set_level_maxrows(maxrows);
   thandle.set_level_maxrowsperchunk(maxrowsperchunk);
-
-  level_nchunks       = lnchunks;
-  level_nrowsperchunk = lnrowsperchunk;
 }
 
 // Linear Search for the smallest row index
@@ -326,7 +322,6 @@ void iluk_symbolic(IlukHandle& thandle,
     HostTmpViewType h_iw("h_iw", nrows);
     HostTmpViewType h_iL("h_iL", nrows);
     HostTmpViewType h_llev("h_llev", nrows);
-    HostTmpViewType level_nchunks, level_nrowsperchunk;
 
     size_type cntL = 0;
     size_type cntU = 0;
@@ -472,19 +467,13 @@ void iluk_symbolic(IlukHandle& thandle,
     // Level scheduling on L
     if (thandle.get_algorithm() ==
         KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) {
-      level_sched(thandle, L_row_map, L_entries, level_list, level_ptr,
-                  level_idx, level_nchunks, level_nrowsperchunk, nlev);
-
-      thandle.alloc_level_nchunks(nlev);
-      thandle.alloc_level_nrowsperchunk(nlev);
-      HandleDeviceEntriesType dlevel_nchunks = thandle.get_level_nchunks();
-      HandleDeviceEntriesType dlevel_nrowsperchunk =
-          thandle.get_level_nrowsperchunk();
-      Kokkos::deep_copy(dlevel_nchunks, level_nchunks);
-      Kokkos::deep_copy(dlevel_nrowsperchunk, level_nrowsperchunk);
+      level_sched_tp(thandle, L_row_map, L_entries, level_list, level_ptr,
+                     level_idx, nlev);
+      thandle.alloc_iw(thandle.get_level_maxrowsperchunk(), nrows);
     } else {
       level_sched(thandle, L_row_map, L_entries, level_list, level_ptr,
                   level_idx, nlev);
+      thandle.alloc_iw(thandle.get_level_maxrows(), nrows);
     }
 
     Kokkos::deep_copy(dlevel_ptr, level_ptr);
diff --git a/src/sparse/impl/KokkosSparse_spmv_blockcrsmatrix_spec.hpp b/src/sparse/impl/KokkosSparse_spmv_blockcrsmatrix_spec.hpp
index 7132ec0fe1..14b75f1c39 100644
--- a/src/sparse/impl/KokkosSparse_spmv_blockcrsmatrix_spec.hpp
+++ b/src/sparse/impl/KokkosSparse_spmv_blockcrsmatrix_spec.hpp
@@ -101,10 +101,10 @@ struct spmv_mv_blockcrsmatrix_eti_spec_avail {
       const SCALAR_TYPE, const ORDINAL_TYPE,                              \
       Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,                    \
       Kokkos::MemoryTraits<Kokkos::Unmanaged>, const OFFSET_TYPE,         \
-      SCALAR_TYPE const *, LAYOUT_TYPE,                                   \
+      SCALAR_TYPE const **, LAYOUT_TYPE,                                  \
       Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,                    \
       Kokkos::MemoryTraits<Kokkos::Unmanaged | Kokkos::RandomAccess>,     \
-      SCALAR_TYPE *, LAYOUT_TYPE,                                         \
+      SCALAR_TYPE **, LAYOUT_TYPE,                                        \
       Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,                    \
       Kokkos::MemoryTraits<Kokkos::Unmanaged> > {                         \
     enum : bool { value = true };                                         \
diff --git a/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp
index b87a9fa460..313098372a 100644
--- a/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp
+++ b/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp
@@ -46,6 +46,7 @@
 #define KOKKOSSPARSE_IMPL_SPMV_BSRMATRIX_IMPL_HPP_
 
 #include "KokkosKernels_Error.hpp"
+#include "KokkosKernels_ExecSpaceUtils.hpp"
 
 #if defined(KOKKOS_ENABLE_CUDA) && \
     (defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_AMPERE))
@@ -320,10 +321,8 @@ struct BsrMatrixSpMVTensorCoreFunctor {
     // no barrier - each warp uses independent shared memory
 
     // load from the shared memory
-#ifdef __CUDA_ARCH__
     load_matrix_sync(fy, &sy(warpIdx_y, warpIdx_x, 0, 0), FRAG_N,
                      nvcuda::wmma::mem_row_major);
-#endif
 
     auto rowView = a.block_row_Const(blockIdx_i);
 
@@ -363,17 +362,12 @@ struct BsrMatrixSpMVTensorCoreFunctor {
           const AOrdinal bj = bk + tj;
 
           // fill shmem with 0 outside of the block boundary
-#ifdef __CUDA_ARCH__
           if (bi < a.blockDim() && bj < a.blockDim()) {
             sa(ti / FRAG_M, ti % FRAG_M, tj) =
                 AFragScalar(alpha * ap[bi * a.blockDim() + bj]);
           } else {
             sa(ti / FRAG_M, ti % FRAG_M, tj) = AFragScalar(0);
           }
-#else
-          (void)bi;
-          (void)bj;
-#endif
         }
 
         // collaborative load of X fragments into shared memory
@@ -391,7 +385,6 @@ struct BsrMatrixSpMVTensorCoreFunctor {
           // load 0 outside of the block boundary
           // x is not necessarily a multiple of block size, so make sure access
           // is in bounds
-#ifdef __CUDA_ARCH__
           if (bi < a.blockDim() && bj < a.blockDim() &&
               unsigned(blockIdx_j * a.blockDim() + bj) < x.extent(1)) {
             // tile is some fragments in the j/n direction that are frag_n wide
@@ -400,15 +393,10 @@ struct BsrMatrixSpMVTensorCoreFunctor {
           } else {
             sx(tj / FRAG_N, ti, tj % FRAG_N) = XFragScalar(0);
           }
-#else
-          (void)bi;
-          (void)bj;
-#endif
         }
         mbr.team_barrier();
 
         // load correct fragment from shared memory and accumulate
-#ifdef __CUDA_ARCH__
         // only need to do any math if our fragment will write a result back to
         // Y
         if (ay_i < static_cast<AOrdinal>(y.extent(0)) &&
@@ -417,17 +405,12 @@ struct BsrMatrixSpMVTensorCoreFunctor {
           load_matrix_sync(fx, &sx(warpIdx_x, 0, 0), FRAG_N);
           mma_sync(fy, fa, fx, fy);
         }
-#endif
       }
-      (void)j;
-      (void)ap;
     }  // loop through blocks in row of A
 
-#ifdef __CUDA_ARCH__
     // store Y fragments into shared memory
     store_matrix_sync(&sy(warpIdx_y, warpIdx_x, 0, 0), fy, FRAG_N,
                       nvcuda::wmma::mem_row_major);
-#endif
     // team loads its fragments of Y that make up part or all of the block of Y
     // it's responsible for. each warp loads the part corresponding to its y
     // fragment
@@ -447,21 +430,16 @@ struct BsrMatrixSpMVTensorCoreFunctor {
       }
     }
     mbr.team_barrier();
-
-    // Suppress unused var warnings
-    // TODO (@cwpearson): Should this functor only compile on device?
-    (void)fx;
-    (void)fa;
-    (void)fy;
   }
 };
 
-/* Instantiate some common template parameter values
-   for BsrMatrixSpMVTensorCoreFunctor.
-   This is a struct instead of a function for template...using shorthand
-   Discriminates between complex (supported) and non-complex (unsupported)
-   scalar types, and throws a runtime error for unsupported types
-*/
+/// \brief Avoid instantiating tensor core functor for unsupported types
+///
+/// Instantiate some common template parameter values
+/// for BsrMatrixSpMVTensorCoreFunctor.
+/// This is a struct instead of a function for template...using shorthand
+/// Discriminates between non-complex/on-GPU (supported) and otherwise
+/// (unsupported) scalar types, and throws a runtime error for unsupported types
 template <typename AMatrix,
           typename AFragScalar,  // input matrix type and fragment scalar type
           typename XMatrix, typename XFragScalar, typename YMatrix,
@@ -517,10 +495,11 @@ struct BsrMatrixSpMVTensorCoreDispatcher {
   static void tag_dispatch(std::false_type, YScalar, AMatrix, XMatrix, YScalar,
                            YMatrix) {
     KokkosKernels::Impl::throw_runtime_exception(
-        "unsupported for complex types");
+        "Tensor core SpMV is only supported for non-complex types in GPU "
+        "execution spaces");
   }
 
-  /*true if T1, T2, or T3 are complex*/
+  /*true if none of T1, T2, or T3 are complex*/
   template <typename T1, typename T2, typename T3>
   struct none_complex {
     const static bool value = !Kokkos::ArithTraits<T1>::is_complex &&
@@ -528,11 +507,22 @@ struct BsrMatrixSpMVTensorCoreDispatcher {
                               !Kokkos::ArithTraits<T3>::is_complex;
   };
 
+  /*true if T1::execution_space, T2, or T3 are all GPU exec space*/
+  template <typename T1, typename T2, typename T3>
+  struct all_gpu {
+    const static bool value = KokkosKernels::Impl::kk_is_gpu_exec_space<T1>() &&
+                              KokkosKernels::Impl::kk_is_gpu_exec_space<T2>() &&
+                              KokkosKernels::Impl::kk_is_gpu_exec_space<T3>();
+  };
+
   static void dispatch(YScalar alpha, AMatrix a, XMatrix x, YScalar beta,
                        YMatrix y) {
-    using tag =
-        std::integral_constant<bool,
-                               none_complex<AScalar, XScalar, YScalar>::value>;
+    // tag will be false unless all conditions are met
+    using tag = std::integral_constant<
+        bool, none_complex<AScalar, XScalar, YScalar>::value &&
+                  all_gpu<typename AMatrix::execution_space,
+                          typename XMatrix::execution_space,
+                          typename YMatrix::execution_space>::value>;
     tag_dispatch(tag{}, alpha, a, x, beta, y);
   }
 };
@@ -552,7 +542,7 @@ struct BsrMatrixSpMVTensorCoreDispatcher {
 #include "KokkosBatched_Gemv_TeamVector_Internal.hpp"
 #include "KokkosBatched_Gemm_Serial_Internal.hpp"
 #include "KokkosBatched_Gemm_TeamVector_Internal.hpp"
-#include "KokkosBatched_Scale_Internal.hpp"
+#include "KokkosBlas1_team_scal_impl.hpp"
 #include "KokkosKernels_ExecSpaceUtils.hpp"
 
 #include "KokkosBlas2_team_gemv_spec.hpp"
@@ -649,7 +639,7 @@ struct BSR_GEMV_Functor {
     const y_value_type val_one = Kokkos::ArithTraits<y_value_type>::one();
     ;
     if (beta != val_one) {
-      KokkosBatched::TeamVectorScaleInternal::invoke(
+      KokkosBlas::Impl::TeamVectorScaleInternal::invoke(
           dev, block_dim, beta, Y_cur.data(),
           static_cast<int>(Y_cur.stride_0()));
     }
@@ -979,6 +969,8 @@ struct BSR_GEMV_Transpose_Functor {
                                Kokkos::atomic_add(&Y_cur(ijk),
                                                   shared_view(ijk));
                              });
+        //
+        dev.team_barrier();
       }
     } else {
       for (ordinal_type jBlock = 0; jBlock < count; ++jBlock) {
@@ -1008,6 +1000,8 @@ struct BSR_GEMV_Transpose_Functor {
                              [&](const ordinal_type &ijk) {
                                Kokkos::atomic_add(&Y_cur(ijk), shared_y[ijk]);
                              });
+        //
+        dev.team_barrier();
       }
     }
   }
@@ -1282,7 +1276,7 @@ struct BSR_GEMM_Functor {
 
     const y_value_type val_one = Kokkos::ArithTraits<y_value_type>::one();
     if (beta != val_one) {
-      KokkosBatched::TeamVectorScaleInternal::invoke(
+      KokkosBlas::Impl::TeamVectorScaleInternal::invoke(
           dev, block_dim, num_rhs, beta, Y_cur.data(),
           static_cast<int>(Y_cur.stride_0()),
           static_cast<int>(Y_cur.stride_1()));
diff --git a/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp b/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp
index 4d6d6cd1b5..52bbb2f839 100644
--- a/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp
+++ b/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp
@@ -201,88 +201,125 @@ struct SPMV_MV_BSRMATRIX<AT, AO, AD, AM, AS, XT, XL, XD, XM, YT, YL, YD, YM,
   typedef Kokkos::View<YT, YL, YD, YM> YVector;
   typedef typename YVector::non_const_value_type YScalar;
 
+  enum class Method {
+    Fallback,    ///< Don't use tensor cores
+    TensorCores  ///< use tensor cores
+  };
+
+  /// Precision to use in the tensor core implementation
+  enum class Precision {
+    Automatic,  ///< Use Double, unless operations match mixed precision
+    Double,     ///< fp64 += fp64 * fp64
+    Mixed       ///< fp32 += fp16 * fp16
+  };
+
   static void spmv_mv_bsrmatrix(
       const KokkosKernels::Experimental::Controls &controls, const char mode[],
       const YScalar &alpha, const AMatrix &A, const XVector &X,
       const YScalar &beta, const YVector &Y) {
 #if defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_VOLTA)
-    // user explicitly requests a particular precision
-    bool requestMixed  = false;
-    bool requestDouble = false;
-    if (controls.isParameter("tc_precision")) {
-      if (controls.getParameter("tc_precision") == "mixed") {
-        requestMixed = true;
-      } else if (controls.getParameter("tc_precision") == "double") {
-        requestDouble = true;
-      }
-    }
-    //
-    bool use_tc = false;
-    if ((controls.isParameter("algorithm")) &&
-        (controls.getParameter("algorithm") == "experimental_bsr_tc")) {
-      if (Kokkos::Details::ArithTraits<YScalar>::is_complex == false)
-        use_tc = true;
+    Method method = Method::Fallback;
+    {
+      typedef typename AMatrix::non_const_value_type AScalar;
+      typedef typename XVector::non_const_value_type XScalar;
+      // try to use tensor cores if requested
+      if (controls.getParameter("algorithm") == "experimental_bsr_tc")
+        method = Method::TensorCores;
+      // can't use tensor cores for complex
+      if (Kokkos::Details::ArithTraits<YScalar>::is_complex)
+        method = Method::Fallback;
+      if (Kokkos::Details::ArithTraits<XScalar>::is_complex)
+        method = Method::Fallback;
+      if (Kokkos::Details::ArithTraits<AScalar>::is_complex)
+        method = Method::Fallback;
+      // can't use tensor cores outside GPU
+      if (!KokkosKernels::Impl::kk_is_gpu_exec_space<
+              typename AMatrix::execution_space>())
+        method = Method::Fallback;
+      if (!KokkosKernels::Impl::kk_is_gpu_exec_space<
+              typename XVector::execution_space>())
+        method = Method::Fallback;
+      if (!KokkosKernels::Impl::kk_is_gpu_exec_space<
+              typename YVector::execution_space>())
+        method = Method::Fallback;
+      // can't use tensor cores unless mode is no-transpose
+      if (mode[0] != KokkosSparse::NoTranspose[0]) method = Method::Fallback;
+#if KOKKOS_HALF_T_IS_FLOAT
+      // disable tensor cores when Kokkos half is actually a float
+      method = Method::Fallback;
+#endif  // KOKKOS_HALF_T_IS_FLOAT
     }
-#endif
+#endif  // AMPERE || VOLTA
 
 #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ARCH_AMPERE)
-    typedef typename XVector::non_const_value_type XScalar;
-    typedef typename AMatrix::non_const_value_type AScalar;
-    typedef Kokkos::Experimental::half_t Half;
-
-    /* Ampere has double += double * double and float += half * half
-
-    use whichever is requested.
-    If none requested, used mixed precision if the inputs are mixed, otherwise
-    use double
-    */
-
-    // input precision matches a tensor core fragment type
-    constexpr bool operandsHalfHalfFloat = std::is_same<AScalar, Half>::value &&
-                                           std::is_same<XScalar, Half>::value &&
-                                           std::is_same<YScalar, float>::value;
-
-    if (use_tc) {
-      if (requestMixed) {
-        BsrMatrixSpMVTensorCoreDispatcher<AMatrix, half, XVector, half, YVector,
-                                          float, 16, 16, 16>::dispatch(alpha, A,
-                                                                       X, beta,
-                                                                       Y);
-        return;
-      } else if (requestDouble) {
-        BsrMatrixSpMVTensorCoreDispatcher<AMatrix, double, XVector, double,
-                                          YVector, double, 8, 8,
-                                          4>::dispatch(alpha, A, X, beta, Y);
-        return;
-      } else if (operandsHalfHalfFloat) {
+    {
+      typedef Kokkos::Experimental::half_t Half;
+      typedef typename AMatrix::non_const_value_type AScalar;
+      typedef typename XVector::non_const_value_type XScalar;
+
+      /* Ampere has double += double * double and float += half * half
+
+      use whichever is requested.
+      If none requested, used mixed precision if the inputs are mixed, otherwise
+      use double
+      */
+      if (Method::TensorCores == method) {
+        Precision precision = Precision::Automatic;
+        if (controls.getParameter("tc_precision") == "mixed")
+          precision = Precision::Mixed;
+        else if (controls.getParameter("tc_precision") == "double")
+          precision = Precision::Double;
+
+        switch (precision) {
+          case Precision::Mixed: {
+            BsrMatrixSpMVTensorCoreDispatcher<AMatrix, half, XVector, half,
+                                              YVector, float, 16, 16,
+                                              16>::dispatch(alpha, A, X, beta,
+                                                            Y);
+            return;
+          }
+          case Precision::Double: {
+            BsrMatrixSpMVTensorCoreDispatcher<AMatrix, double, XVector, double,
+                                              YVector, double, 8, 8,
+                                              4>::dispatch(alpha, A, X, beta,
+                                                           Y);
+            return;
+          }
+          case Precision::Automatic:  // fallthrough
+          default: {
+            constexpr bool operandsHalfHalfFloat =
+                std::is_same<AScalar, Half>::value &&
+                std::is_same<XScalar, Half>::value &&
+                std::is_same<YScalar, float>::value;
+            if (operandsHalfHalfFloat) {
+              BsrMatrixSpMVTensorCoreDispatcher<AMatrix, half, XVector, half,
+                                                YVector, float, 16, 16,
+                                                16>::dispatch(alpha, A, X, beta,
+                                                              Y);
+              return;
+            } else {
+              BsrMatrixSpMVTensorCoreDispatcher<AMatrix, double, XVector,
+                                                double, YVector, double, 8, 8,
+                                                4>::dispatch(alpha, A, X, beta,
+                                                             Y);
+              return;
+            }
+          }
+        }
+      }
+    }
+#elif defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ARCH_VOLTA)
+    {
+      /* Volta has float += half * half
+         use it for all matrices
+      */
+      if (Method::TensorCores == method) {
         BsrMatrixSpMVTensorCoreDispatcher<AMatrix, half, XVector, half, YVector,
                                           float, 16, 16, 16>::dispatch(alpha, A,
                                                                        X, beta,
                                                                        Y);
         return;
-      } else {
-        BsrMatrixSpMVTensorCoreDispatcher<AMatrix, double, XVector, double,
-                                          YVector, double, 8, 8,
-                                          4>::dispatch(alpha, A, X, beta, Y);
-        return;
-      }
-    }
-#elif defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ARCH_VOLTA)
-    /* Volta has float += half * half
-       use it for all matrices
-    */
-    if (use_tc) {
-      if (requestDouble) {
-        KokkosKernels::Impl::throw_runtime_exception(
-            "KokkosSparse::spmv[algorithm=experimental_bsr_tc] "
-            "tc_precision=double unsupported KOKKOS_ARCH_VOLTA");
       }
-      BsrMatrixSpMVTensorCoreDispatcher<AMatrix, half, XVector, half, YVector,
-                                        float, 16, 16, 16>::dispatch(alpha, A,
-                                                                     X, beta,
-                                                                     Y);
-      (void)requestMixed;  // unused
-      return;
     }
 #endif  // KOKKOS_ARCH
 
diff --git a/src/sparse/impl/KokkosSparse_spmv_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_impl.hpp
index 41843d8674..fcd02a851e 100644
--- a/src/sparse/impl/KokkosSparse_spmv_impl.hpp
+++ b/src/sparse/impl/KokkosSparse_spmv_impl.hpp
@@ -94,7 +94,7 @@ struct SPMV_Transpose_Functor {
   AMatrix m_A;
   XVector m_x;
   YVector m_y;
-  ordinal_type rows_per_team;
+  ordinal_type rows_per_team = 0;
 
   SPMV_Transpose_Functor(const coefficient_type& alpha_, const AMatrix& m_A_,
                          const XVector& m_x_, const YVector& m_y_)
@@ -725,7 +725,7 @@ struct SPMV_MV_Transpose_Functor {
   YVector m_y;
 
   const ordinal_type n;
-  ordinal_type rows_per_team;
+  ordinal_type rows_per_team = 0;
 
   SPMV_MV_Transpose_Functor(const coefficient_type& alpha_, const AMatrix& m_A_,
                             const XVector& m_x_, const coefficient_type& beta_,
diff --git a/src/sparse/impl/KokkosSparse_spmv_spec.hpp b/src/sparse/impl/KokkosSparse_spmv_spec.hpp
index e0fdb2b6cd..cc29d72b77 100644
--- a/src/sparse/impl/KokkosSparse_spmv_spec.hpp
+++ b/src/sparse/impl/KokkosSparse_spmv_spec.hpp
@@ -111,6 +111,8 @@ struct spmv_mv_eti_spec_avail {
 // Include the actual specialization declarations
 #include <KokkosSparse_spmv_tpl_spec_avail.hpp>
 #include <generated_specializations_hpp/KokkosSparse_spmv_eti_spec_avail.hpp>
+
+#include <KokkosSparse_spmv_mv_tpl_spec_avail.hpp>
 #include <generated_specializations_hpp/KokkosSparse_spmv_mv_eti_spec_avail.hpp>
 
 namespace KokkosSparse {
@@ -204,7 +206,8 @@ struct SPMV_MV {
   typedef Kokkos::View<YT, YL, YD, YM> YVector;
   typedef typename YVector::non_const_value_type coefficient_type;
 
-  static void spmv_mv(const char mode[], const coefficient_type& alpha,
+  static void spmv_mv(const KokkosKernels::Experimental::Controls& controls,
+                      const char mode[], const coefficient_type& alpha,
                       const AMatrix& A, const XVector& x,
                       const coefficient_type& beta, const YVector& y);
 };
@@ -261,7 +264,8 @@ struct SPMV_MV<AT, AO, AD, AM, AS, XT, XL, XD, XM, YT, YL, YD, YM, false, false,
   typedef Kokkos::View<YT, YL, YD, YM> YVector;
   typedef typename YVector::non_const_value_type coefficient_type;
 
-  static void spmv_mv(const char mode[], const coefficient_type& alpha,
+  static void spmv_mv(const KokkosKernels::Experimental::Controls& /*controls*/,
+                      const char mode[], const coefficient_type& alpha,
                       const AMatrix& A, const XVector& x,
                       const coefficient_type& beta, const YVector& y) {
     typedef Kokkos::Details::ArithTraits<coefficient_type> KAT;
@@ -287,7 +291,8 @@ struct SPMV_MV<AT, AO, AD, AM, AS, XT, XL, XD, XM, YT, YL, YD, YM, true, false,
   typedef Kokkos::View<YT, YL, YD, YM> YVector;
   typedef typename YVector::non_const_value_type coefficient_type;
 
-  static void spmv_mv(const char mode[], const coefficient_type& alpha,
+  static void spmv_mv(const KokkosKernels::Experimental::Controls& /*controls*/,
+                      const char mode[], const coefficient_type& alpha,
                       const AMatrix& A, const XVector& x,
                       const coefficient_type& beta, const YVector& y) {
     static_assert(std::is_integral<AT>::value,
@@ -377,6 +382,8 @@ struct SPMV_MV<AT, AO, AD, AM, AS, XT, XL, XD, XM, YT, YL, YD, YM, true, false,
 
 #include <KokkosSparse_spmv_tpl_spec_decl.hpp>
 #include <generated_specializations_hpp/KokkosSparse_spmv_eti_spec_decl.hpp>
+
+#include <KokkosSparse_spmv_mv_tpl_spec_decl.hpp>
 #include <generated_specializations_hpp/KokkosSparse_spmv_mv_eti_spec_decl.hpp>
 
 #endif  // KOKKOSSPARSE_IMPL_SPMV_SPEC_HPP_
diff --git a/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp
index 7943b1e602..fbee2fb33f 100644
--- a/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp
+++ b/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp
@@ -1141,8 +1141,7 @@ struct UpperTriSupernodalFunctor {
         KokkosBatched::TeamTrsm<
             member_type, KokkosBatched::Side::Left, KokkosBatched::Uplo::Lower,
             KokkosBatched::Trans::Transpose, KokkosBatched::Diag::NonUnit,
-            KokkosBatched::Algo::Trsm::Unblocked>::template invoke(team, one,
-                                                                   Ujj, Xjj);
+            KokkosBatched::Algo::Trsm::Unblocked>::invoke(team, one, Ujj, Xjj);
       }
       team.team_barrier();
     }
diff --git a/src/sparse/impl/KokkosSparse_trsv_impl.hpp b/src/sparse/impl/KokkosSparse_trsv_impl.hpp
index f076368827..bff037c228 100644
--- a/src/sparse/impl/KokkosSparse_trsv_impl.hpp
+++ b/src/sparse/impl/KokkosSparse_trsv_impl.hpp
@@ -218,6 +218,7 @@ void upperTriSolveCsr(RangeMultiVectorType X, const CrsMatrixType& A,
   typename CrsMatrixType::row_map_type ptr = A.graph.row_map;
   typename CrsMatrixType::index_type ind   = A.graph.entries;
   typename CrsMatrixType::values_type val  = A.values;
+  typedef Kokkos::Details::ArithTraits<matrix_scalar_type> STS;
 
   // If local_ordinal_type is unsigned and numRows is 0, the loop
   // below will have entirely the wrong number of iterations.
@@ -232,15 +233,18 @@ void upperTriSolveCsr(RangeMultiVectorType X, const CrsMatrixType& A,
     for (local_ordinal_type j = 0; j < numVecs; ++j) {
       X(r, j) = Y(r, j);
     }
-    const offset_type beg = ptr(r);
-    const offset_type end = ptr(r + 1);
-    // We assume the diagonal entry is first in the row.
-    const matrix_scalar_type A_rr = val(beg);
-    for (offset_type k = beg + static_cast<offset_type>(1); k < end; ++k) {
+    const offset_type beg   = ptr(r);
+    const offset_type end   = ptr(r + 1);
+    matrix_scalar_type A_rr = STS::zero();
+    for (offset_type k = beg; k < end; ++k) {
       const matrix_scalar_type A_rc = val(k);
       const local_ordinal_type c    = ind(k);
-      for (local_ordinal_type j = 0; j < numVecs; ++j) {
-        X(r, j) -= A_rc * X(c, j);
+      if (r == c) {
+        A_rr += A_rc;
+      } else {
+        for (local_ordinal_type j = 0; j < numVecs; ++j) {
+          X(r, j) -= A_rc * X(c, j);
+        }
       }
     }  // for each entry A_rc in the current row r
     for (local_ordinal_type j = 0; j < numVecs; ++j) {
@@ -254,15 +258,18 @@ void upperTriSolveCsr(RangeMultiVectorType X, const CrsMatrixType& A,
     for (local_ordinal_type j = 0; j < numVecs; ++j) {
       X(r, j) = Y(r, j);
     }
-    const offset_type beg = ptr(r);
-    const offset_type end = ptr(r + 1);
-    // We assume the diagonal entry is first in the row.
-    const matrix_scalar_type A_rr = val(beg);
-    for (offset_type k = beg + 1; k < end; ++k) {
+    const offset_type beg   = ptr(r);
+    const offset_type end   = ptr(r + 1);
+    matrix_scalar_type A_rr = STS::zero();
+    for (offset_type k = beg; k < end; ++k) {
       const matrix_scalar_type A_rc = val(k);
       const local_ordinal_type c    = ind(k);
-      for (local_ordinal_type j = 0; j < numVecs; ++j) {
-        X(r, j) -= A_rc * X(c, j);
+      if (r == c)
+        A_rr += A_rc;
+      else {
+        for (local_ordinal_type j = 0; j < numVecs; ++j) {
+          X(r, j) -= A_rc * X(c, j);
+        }
       }
     }  // for each entry A_rc in the current row r
     for (local_ordinal_type j = 0; j < numVecs; ++j) {
diff --git a/src/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp b/src/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp
index 19bc5ec163..d779ff3e96 100644
--- a/src/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp
+++ b/src/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp
@@ -57,7 +57,7 @@
 
 // needed for classical GS
 #include "KokkosSparse_sptrsv.hpp"
-#include "KokkosKernels_SparseUtils.hpp"
+#include "KokkosSparse_Utils.hpp"
 
 #include "KokkosSparse_gauss_seidel_handle.hpp"
 
@@ -854,11 +854,11 @@ class TwostageGaussSeidel {
                                                // values
         // CuSparse needs matrix sorted by column indexes for each row
         // TODO: may need to move this to symbolic/numeric of sptrsv
-        KokkosKernels::sort_crs_matrix<execution_space, const_row_map_view_t,
-                                       entries_view_t, values_view_t>(
+        KokkosSparse::sort_crs_matrix<execution_space, const_row_map_view_t,
+                                      entries_view_t, values_view_t>(
             rowmap_viewL, column_viewL, values_viewL);
-        KokkosKernels::sort_crs_matrix<execution_space, const_row_map_view_t,
-                                       entries_view_t, values_view_t>(
+        KokkosSparse::sort_crs_matrix<execution_space, const_row_map_view_t,
+                                      entries_view_t, values_view_t>(
             rowmap_viewU, column_viewU, values_viewU);
 
         // now do symbolic
diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp
index 2878543f33..976da2c358 100644
--- a/test_common/KokkosKernels_TestUtils.hpp
+++ b/test_common/KokkosKernels_TestUtils.hpp
@@ -45,7 +45,10 @@
 #ifndef KOKKOSKERNELS_TEST_UTILS_HPP
 #define KOKKOSKERNELS_TEST_UTILS_HPP
 
+#include <random>
+
 #include "KokkosKernels_Utils.hpp"
+#include "KokkosKernels_IOUtils.hpp"
 #include "Kokkos_ArithTraits.hpp"
 #include "KokkosSparse_spmv.hpp"
 // Make this include-able from all subdirectories
@@ -338,6 +341,15 @@ class epsilon<Kokkos::Experimental::half_t> {
 };
 #endif  // KOKKOS_HALF_T_IS_FLOAT
 
+// explicit epsilon specializations
+#if defined(KOKKOS_BHALF_T_IS_FLOAT) && !KOKKOS_BHALF_T_IS_FLOAT
+template <>
+class epsilon<Kokkos::Experimental::bhalf_t> {
+ public:
+  constexpr static double value = 0.0078125F;
+};
+#endif  // KOKKOS_HALF_T_IS_FLOAT
+
 using KokkosKernels::Impl::getRandomBounds;
 
 template <typename scalar_t, typename lno_t, typename size_type,
@@ -488,5 +500,158 @@ int string_compare_no_case(const char* str1, const char* str2) {
   return strcmp(str1_s.c_str(), str2_s.c_str());
 }
 
+/// /brief Csc matrix class for testing purposes.
+/// \tparam ScalarType
+/// \tparam LayoutType
+/// \tparam ExeSpaceType
+template <class ScalarType, class LayoutType, class ExeSpaceType>
+class RandCscMat {
+ private:
+  using ValViewTypeD    = Kokkos::View<ScalarType*, LayoutType, ExeSpaceType>;
+  using RowIdViewTypeD  = Kokkos::View<int64_t*, LayoutType, ExeSpaceType>;
+  using ColMapViewTypeD = Kokkos::View<int64_t*, LayoutType, ExeSpaceType>;
+  int64_t __nrows;
+  int64_t __ncols;
+  int64_t __nnz = 0;
+  ColMapViewTypeD __col_map_d;
+  RowIdViewTypeD __row_ids_d;
+  ValViewTypeD __vals_d;
+  using ColMapViewTypeH = typename ColMapViewTypeD::HostMirror;
+  using RowIdViewTypeH  = typename RowIdViewTypeD::HostMirror;
+  using ValViewTypeH    = typename ValViewTypeD::HostMirror;
+  ColMapViewTypeH __col_map;
+  RowIdViewTypeH __row_ids;
+  ValViewTypeH __vals;
+  bool __fully_sparse;
+
+  /// Generates a random column map where:
+  ///  1. __col_map(i) is in [__row_ids.data(), &row_ids.data()[nnz - 1]
+  ///  2. __col_map(i) > col_map(i - 1) for i > 1
+  ///  3. __col_map(i) == col_map(j) iff __col_map(i) == col_map(j) == nullptr
+  ///  4. __col_map(i) - col_map(i - 1) is in [0, m]
+  void __populate_random_csc_mat(uint64_t ticks) {
+    std::srand(ticks);
+    for (int64_t col_idx = 0; col_idx < __ncols; col_idx++) {
+      int64_t r = std::rand() % (__nrows + 1);
+      if (r == 0 || __fully_sparse) {  // 100% sparse column
+        __col_map(col_idx) = __nnz;
+      } else {  // sparse column with r elements
+        // Populate r row ids
+        std::vector<int64_t> v(r);
+
+        for (int64_t i = 0; i < r; i++) v.at(i) = i;
+
+        std::shuffle(v.begin(), v.end(), std::mt19937(std::random_device()()));
+
+        for (int64_t i = 0; i < r; i++) __row_ids(i + __nnz) = v.at(i);
+
+        // Point to new column and accumulate number of non zeros
+        __col_map(col_idx) = __nnz;
+        __nnz += r;
+      }
+    }
+
+    // last entry in map points to end of row id list
+    __col_map(__ncols) = __nnz;
+
+    // Copy to device
+    Kokkos::deep_copy(__col_map_d, __col_map);
+    Kokkos::deep_copy(__row_ids_d, __row_ids);
+    ExeSpaceType().fence();
+  }
+
+  template <class T>
+  T __getter_copy_helper(T src) {
+    T dst(std::string("RandCscMat.") + typeid(T).name() + " copy",
+          src.extent(0));
+    Kokkos::deep_copy(dst, src);
+    ExeSpaceType().fence();
+    return dst;
+  }
+
+ public:
+  std::string info;
+  /// Constructs a random csc matrix.
+  /// \param m The number of rows.
+  /// \param n The number of columns.
+  /// \param min_val The minimum scalar value in the matrix.
+  /// \param max_val The maximum scalar value in the matrix.
+  RandCscMat(int64_t m, int64_t n, ScalarType min_val, ScalarType max_val,
+             bool fully_sparse = false) {
+    __ncols        = n;
+    __nrows        = m;
+    __fully_sparse = fully_sparse;
+    __col_map_d    = ColMapViewTypeD("RandCscMat.ColMapViewType", __ncols + 1);
+    __col_map      = Kokkos::create_mirror_view(__col_map_d);
+    __row_ids_d    = RowIdViewTypeD("RandCscMat.RowIdViewType",
+                                 m * n + 1);  // over-allocated
+    __row_ids      = Kokkos::create_mirror_view(__row_ids_d);
+
+    uint64_t ticks =
+        std::chrono::high_resolution_clock::now().time_since_epoch().count() %
+        UINT32_MAX;
+
+    info = std::string(
+        std::string("RandCscMat<") + typeid(ScalarType).name() + ", " +
+        typeid(LayoutType).name() + ", " + typeid(ExeSpaceType).name() + ">(" +
+        std::to_string(m) + ", " + std::to_string(n) +
+        "...): rand seed: " + std::to_string(ticks) +
+        ", fully sparse: " + (__fully_sparse ? "true" : "false") + "\n");
+    Kokkos::Random_XorShift64_Pool<Kokkos::HostSpace> random(ticks);
+    __populate_random_csc_mat(ticks);
+
+    __vals_d = ValViewTypeD("RandCscMat.ValViewType", __nnz + 1);
+    __vals   = Kokkos::create_mirror_view(__vals_d);
+    Kokkos::fill_random(__vals, random, min_val, max_val);  // random scalars
+    Kokkos::fence();
+    __vals(__nnz) = ScalarType(0);
+
+    // Copy to device
+    Kokkos::deep_copy(__vals_d, __vals);
+    ExeSpaceType().fence();
+  }
+
+  // O(c), where c is a constant.
+  ScalarType operator()(int64_t idx) { return __vals(idx); }
+
+  int64_t get_nnz() { return __nnz; }
+  int64_t get_m() { return __nrows; }
+  int64_t get_n() { return __ncols; }
+  int64_t get_col_len(int64_t j) {
+    return j < __ncols ? (__col_map(j + 1) - __col_map(j)) : 0;
+  }
+  int64_t get_col_start(int64_t j) { return j < __ncols ? __col_map(j) : 0; }
+  ValViewTypeD get_vals() { return __getter_copy_helper(__vals_d); }
+  RowIdViewTypeD get_row_ids() { return __getter_copy_helper(__row_ids_d); }
+  ColMapViewTypeD get_col_map() { return __getter_copy_helper(__col_map_d); }
+};
+
+/// \brief Randomly shuffle the entries in each row (col) of a Crs (Ccs) matrix.
+template <typename Rowptrs, typename Entries, typename Values>
+void shuffleMatrixEntries(Rowptrs rowptrs, Entries entries, Values values) {
+  using size_type    = typename Rowptrs::non_const_value_type;
+  using ordinal_type = typename Entries::value_type;
+  auto rowptrsHost =
+      Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rowptrs);
+  auto entriesHost =
+      Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), entries);
+  auto valuesHost =
+      Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), values);
+  ordinal_type numRows =
+      rowptrsHost.extent(0) ? (rowptrsHost.extent(0) - 1) : 0;
+  for (ordinal_type i = 0; i < numRows; i++) {
+    size_type rowBegin = rowptrsHost(i);
+    size_type rowEnd   = rowptrsHost(i + 1);
+    for (size_type j = rowBegin; j < rowEnd - 1; j++) {
+      ordinal_type swapRange = rowEnd - j;
+      size_type swapOffset   = j + (rand() % swapRange);
+      std::swap(entriesHost(j), entriesHost(swapOffset));
+      std::swap(valuesHost(j), valuesHost(swapOffset));
+    }
+  }
+  Kokkos::deep_copy(entries, entriesHost);
+  Kokkos::deep_copy(values, valuesHost);
+}
+
 }  // namespace Test
 #endif
diff --git a/unit_test/batched/dense/Test_Batched_Dense.hpp b/unit_test/batched/dense/Test_Batched_Dense.hpp
index 47a1cf1fd4..edf573c633 100644
--- a/unit_test/batched/dense/Test_Batched_Dense.hpp
+++ b/unit_test/batched/dense/Test_Batched_Dense.hpp
@@ -16,15 +16,14 @@
 #include "Test_Batched_SerialGemv.hpp"
 #include "Test_Batched_SerialGemv_Real.hpp"
 #include "Test_Batched_SerialGemv_Complex.hpp"
+#include "Test_Batched_SerialGesv.hpp"
+#include "Test_Batched_SerialGesv_Real.hpp"
 #include "Test_Batched_SerialInverseLU.hpp"
 #include "Test_Batched_SerialInverseLU_Real.hpp"
 #include "Test_Batched_SerialInverseLU_Complex.hpp"
 #include "Test_Batched_SerialLU.hpp"
 #include "Test_Batched_SerialLU_Real.hpp"
 #include "Test_Batched_SerialLU_Complex.hpp"
-#include "Test_Batched_SerialMatUtil.hpp"
-#include "Test_Batched_SerialMatUtil_Real.hpp"
-#include "Test_Batched_SerialMatUtil_Complex.hpp"
 #include "Test_Batched_SerialSolveLU.hpp"
 #include "Test_Batched_SerialSolveLU_Real.hpp"
 #include "Test_Batched_SerialSolveLU_Complex.hpp"
@@ -52,15 +51,14 @@
 #include "Test_Batched_TeamGemv.hpp"
 #include "Test_Batched_TeamGemv_Real.hpp"
 #include "Test_Batched_TeamGemv_Complex.hpp"
+#include "Test_Batched_TeamGesv.hpp"
+#include "Test_Batched_TeamGesv_Real.hpp"
 #include "Test_Batched_TeamInverseLU.hpp"
 #include "Test_Batched_TeamInverseLU_Real.hpp"
 #include "Test_Batched_TeamInverseLU_Complex.hpp"
 #include "Test_Batched_TeamLU.hpp"
 #include "Test_Batched_TeamLU_Real.hpp"
 #include "Test_Batched_TeamLU_Complex.hpp"
-#include "Test_Batched_TeamMatUtil.hpp"
-#include "Test_Batched_TeamMatUtil_Real.hpp"
-#include "Test_Batched_TeamMatUtil_Complex.hpp"
 #include "Test_Batched_TeamSolveLU.hpp"
 #include "Test_Batched_TeamSolveLU_Real.hpp"
 #include "Test_Batched_TeamSolveLU_Complex.hpp"
@@ -80,6 +78,8 @@
 #include "Test_Batched_TeamVectorGemm.hpp"
 #include "Test_Batched_TeamVectorGemm_Real.hpp"
 #include "Test_Batched_TeamVectorGemm_Complex.hpp"
+#include "Test_Batched_TeamVectorGesv.hpp"
+#include "Test_Batched_TeamVectorGesv_Real.hpp"
 #include "Test_Batched_TeamVectorQR.hpp"
 #include "Test_Batched_TeamVectorQR_Real.hpp"
 #include "Test_Batched_TeamVectorQR_WithColumnPivoting.hpp"
diff --git a/unit_test/batched/dense/Test_Batched_DenseUtils.hpp b/unit_test/batched/dense/Test_Batched_DenseUtils.hpp
new file mode 100644
index 0000000000..d355159a9a
--- /dev/null
+++ b/unit_test/batched/dense/Test_Batched_DenseUtils.hpp
@@ -0,0 +1,44 @@
+#ifndef TEST_BATCHED_DENSE_HELPER_HPP
+#define TEST_BATCHED_DENSE_HELPER_HPP
+
+namespace KokkosBatched {
+template <typename MatrixViewType, typename VectorViewType>
+void create_tridiagonal_batched_matrices(const MatrixViewType &A,
+                                         const VectorViewType &B) {
+  Kokkos::Random_XorShift64_Pool<
+      typename VectorViewType::device_type::execution_space>
+      random(13718);
+  Kokkos::fill_random(
+      B, random,
+      Kokkos::reduction_identity<typename VectorViewType::value_type>::prod());
+
+  auto A_host = Kokkos::create_mirror_view(A);
+
+  const int N       = A.extent(0);
+  const int BlkSize = A.extent(1);
+
+  for (int l = 0; l < N; ++l) {
+    for (int i = 0; i < BlkSize; ++i) {
+      for (int j = i; j < BlkSize; ++j) {
+        if (i == j)
+          A_host(l, i, j) = typename VectorViewType::value_type(2.0);
+        else if (i == j - 1) {
+          A_host(l, i, j) = typename VectorViewType::value_type(-1.0);
+          A_host(l, j, i) = typename VectorViewType::value_type(-1.0);
+        } else {
+          A_host(l, i, j) = typename VectorViewType::value_type(0.0);
+          A_host(l, j, i) = typename VectorViewType::value_type(0.0);
+        }
+      }
+    }
+  }
+
+  Kokkos::fence();
+
+  Kokkos::deep_copy(A, A_host);
+
+  Kokkos::fence();
+}
+}  // namespace KokkosBatched
+
+#endif  // TEST_BATCHED_DENSE_HELPER_HPP
diff --git a/unit_test/batched/dense/Test_Batched_SerialGesv.hpp b/unit_test/batched/dense/Test_Batched_SerialGesv.hpp
new file mode 100644
index 0000000000..233d6bedf3
--- /dev/null
+++ b/unit_test/batched/dense/Test_Batched_SerialGesv.hpp
@@ -0,0 +1,141 @@
+/// \author Kim Liegeois (knliege@sandia.gov)
+
+#include "gtest/gtest.h"
+#include "Kokkos_Core.hpp"
+#include "Kokkos_Random.hpp"
+
+#include "KokkosBatched_Gesv.hpp"
+#include "KokkosBatched_Dot.hpp"
+#include "KokkosBatched_Gemv_Decl.hpp"
+
+#include "KokkosKernels_TestUtils.hpp"
+
+#include "Test_Batched_DenseUtils.hpp"
+
+using namespace KokkosBatched;
+
+namespace Test {
+namespace Gesv {
+
+template <typename DeviceType, typename MatrixType, typename VectorType,
+          typename AlgoTagType>
+struct Functor_TestBatchedSerialGesv {
+  const MatrixType _A;
+  const MatrixType _tmp;
+  const VectorType _X;
+  const VectorType _B;
+
+  KOKKOS_INLINE_FUNCTION
+  Functor_TestBatchedSerialGesv(const MatrixType &A, const MatrixType &tmp,
+                                const VectorType &X, const VectorType &B)
+      : _A(A), _tmp(tmp), _X(X), _B(B) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int k) const {
+    auto A   = Kokkos::subview(_A, k, Kokkos::ALL, Kokkos::ALL);
+    auto x   = Kokkos::subview(_X, k, Kokkos::ALL);
+    auto b   = Kokkos::subview(_B, k, Kokkos::ALL);
+    auto tmp = Kokkos::subview(_tmp, k, Kokkos::ALL, Kokkos::ALL);
+
+    KokkosBatched::SerialGesv<AlgoTagType>::invoke(A, x, b, tmp);
+  }
+
+  inline void run() {
+    typedef typename VectorType::value_type value_type;
+    std::string name_region("KokkosBatched::Test::SerialGesv");
+    const std::string name_value_type = Test::value_type_name<value_type>();
+    std::string name                  = name_region + name_value_type;
+    Kokkos::Profiling::pushRegion(name.c_str());
+    Kokkos::RangePolicy<DeviceType> policy(0, _X.extent(0));
+    Kokkos::parallel_for(name.c_str(), policy, *this);
+    Kokkos::Profiling::popRegion();
+  }
+};
+
+template <typename DeviceType, typename MatrixType, typename VectorType,
+          typename AlgoTagType>
+void impl_test_batched_gesv(const int N, const int BlkSize) {
+  typedef typename MatrixType::value_type value_type;
+  typedef Kokkos::Details::ArithTraits<value_type> ats;
+
+  using MagnitudeType =
+      typename Kokkos::Details::ArithTraits<value_type>::mag_type;
+  using NormViewType =
+      Kokkos::View<MagnitudeType *, Kokkos::LayoutLeft, DeviceType>;
+
+  NormViewType sqr_norm_j("sqr_norm_j", N);
+  auto sqr_norm_j_host = Kokkos::create_mirror_view(sqr_norm_j);
+
+  MatrixType A("A", N, BlkSize, BlkSize), A2("A", N, BlkSize, BlkSize),
+      tmp("tmp", N, BlkSize, BlkSize + 4);
+  VectorType B("b", N, BlkSize), B2("b", N, BlkSize), X("x", N, BlkSize);
+
+  create_tridiagonal_batched_matrices(A, B);
+  Kokkos::deep_copy(A2, A);
+  Kokkos::deep_copy(B2, B);
+
+  auto A_host = Kokkos::create_mirror_view(A2);
+  auto B_host = Kokkos::create_mirror_view(B2);
+  auto X_host = Kokkos::create_mirror_view(X);
+
+  Kokkos::deep_copy(A_host, A2);
+  Kokkos::deep_copy(B_host, B2);
+
+  Kokkos::fence();
+
+  Functor_TestBatchedSerialGesv<DeviceType, MatrixType, VectorType,
+                                AlgoTagType>(A, tmp, X, B)
+      .run();
+
+  Kokkos::fence();
+
+  Kokkos::deep_copy(X_host, X);
+
+  for (int l = 0; l < N; ++l)
+    KokkosBatched::SerialGemv<Trans::NoTranspose,
+                              KokkosBatched::Algo::Gemv::Unblocked>::
+        invoke(-1, Kokkos::subview(A_host, l, Kokkos::ALL, Kokkos::ALL),
+               Kokkos::subview(X_host, l, Kokkos::ALL), 1,
+               Kokkos::subview(B_host, l, Kokkos::ALL));
+
+  KokkosBatched::SerialDot<Trans::NoTranspose>::invoke(B_host, B_host,
+                                                       sqr_norm_j_host);
+
+  const MagnitudeType eps = 1.0e3 * ats::epsilon();
+
+  for (int l = 0; l < N; ++l) EXPECT_NEAR_KK(sqr_norm_j_host(l), 0, eps);
+}
+}  // namespace Gesv
+}  // namespace Test
+
+template <typename DeviceType, typename ValueType, typename AlgoTagType>
+int test_batched_gesv() {
+#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT)
+  {
+    typedef Kokkos::View<ValueType ***, Kokkos::LayoutLeft, DeviceType>
+        MatrixType;
+    typedef Kokkos::View<ValueType **, Kokkos::LayoutLeft, DeviceType>
+        VectorType;
+
+    for (int i = 3; i < 10; ++i) {
+      Test::Gesv::impl_test_batched_gesv<DeviceType, MatrixType, VectorType,
+                                         AlgoTagType>(1024, i);
+    }
+  }
+#endif
+#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT)
+  {
+    typedef Kokkos::View<ValueType ***, Kokkos::LayoutRight, DeviceType>
+        MatrixType;
+    typedef Kokkos::View<ValueType **, Kokkos::LayoutRight, DeviceType>
+        VectorType;
+
+    for (int i = 3; i < 10; ++i) {
+      Test::Gesv::impl_test_batched_gesv<DeviceType, MatrixType, VectorType,
+                                         AlgoTagType>(1024, i);
+    }
+  }
+#endif
+
+  return 0;
+}
diff --git a/unit_test/batched/dense/Test_Batched_SerialGesv_Real.hpp b/unit_test/batched/dense/Test_Batched_SerialGesv_Real.hpp
new file mode 100644
index 0000000000..84a630efa3
--- /dev/null
+++ b/unit_test/batched/dense/Test_Batched_SerialGesv_Real.hpp
@@ -0,0 +1,19 @@
+#if defined(KOKKOSKERNELS_INST_FLOAT)
+TEST_F(TestCategory, batched_scalar_serial_gesv_static_pivoting_float) {
+  test_batched_gesv<TestExecSpace, float,
+                    KokkosBatched::Gesv::StaticPivoting>();
+}
+TEST_F(TestCategory, batched_scalar_serial_gesv_no_pivoting_float) {
+  test_batched_gesv<TestExecSpace, float, KokkosBatched::Gesv::NoPivoting>();
+}
+#endif
+
+#if defined(KOKKOSKERNELS_INST_DOUBLE)
+TEST_F(TestCategory, batched_scalar_serial_gesv_static_pivoting_double) {
+  test_batched_gesv<TestExecSpace, double,
+                    KokkosBatched::Gesv::StaticPivoting>();
+}
+TEST_F(TestCategory, batched_scalar_serial_gesv_no_pivoting_double) {
+  test_batched_gesv<TestExecSpace, double, KokkosBatched::Gesv::NoPivoting>();
+}
+#endif
diff --git a/unit_test/batched/dense/Test_Batched_SerialMatUtil.hpp b/unit_test/batched/dense/Test_Batched_SerialMatUtil.hpp
deleted file mode 100644
index f9a58f5442..0000000000
--- a/unit_test/batched/dense/Test_Batched_SerialMatUtil.hpp
+++ /dev/null
@@ -1,165 +0,0 @@
-/// \author Kyungjoo Kim (kyukim@sandia.gov)
-
-#include "gtest/gtest.h"
-#include "Kokkos_Core.hpp"
-#include "Kokkos_Random.hpp"
-
-#include "KokkosBatched_Set_Decl.hpp"
-#include "KokkosBatched_Set_Impl.hpp"
-
-#include "KokkosBatched_Scale_Decl.hpp"
-#include "KokkosBatched_Scale_Impl.hpp"
-
-#include "KokkosKernels_TestUtils.hpp"
-
-using namespace KokkosBatched;
-
-namespace Test {
-
-enum : int { BatchedSet = 0, BatchedScale = 1 };
-
-struct KokkosKernelTag {};
-struct NaiveTag {};
-
-template <typename DeviceType, typename ViewType, typename ScalarType,
-          typename AlgoTagType, int TestID>
-struct Functor_TestBatchedSerialMatUtil {
-  ScalarType _alpha;
-  ViewType _a;
-
-  KOKKOS_INLINE_FUNCTION
-  Functor_TestBatchedSerialMatUtil(const ScalarType alpha, const ViewType &a)
-      : _alpha(alpha), _a(a) {}
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()(const KokkosKernelTag &, const int i) const {
-    auto A = Kokkos::subview(_a, i, Kokkos::ALL(), Kokkos::ALL());
-    switch (TestID) {
-      case BatchedSet: SerialSet ::invoke(_alpha, A); break;
-      case BatchedScale: SerialScale::invoke(_alpha, A); break;
-    }
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()(const NaiveTag &, const int k) const {
-    // MD Note: changing because of the error with -werror
-    auto A      = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL());
-    const int m = A.extent(0), n = A.extent(1);
-    switch (TestID) {
-      case BatchedSet: {
-        for (int i = 0; i < m; ++i)
-          for (int j = 0; j < n; ++j) A(i, j) = _alpha;
-        break;
-      }
-      case BatchedScale: {
-        for (int i = 0; i < m; ++i)
-          for (int j = 0; j < n; ++j) A(i, j) *= _alpha;
-        break;
-      }
-    }
-  }
-
-  inline int run() {
-    typedef typename ViewType::value_type value_type;
-    std::string name_region("KokkosBatched::Test::SerialMatUtil");
-    const std::string name_value_type = Test::value_type_name<value_type>();
-    std::string name_work_tag =
-        (std::is_same<AlgoTagType, KokkosKernelTag>::value
-             ? "::KokkosBatched"
-             : std::is_same<AlgoTagType, NaiveTag>::value ? "::Naive"
-                                                          : "::UnknownWorkTag");
-    std::string name_test_id =
-        (TestID == BatchedSet
-             ? "Set"
-             : TestID == BatchedScale ? "Scale" : "UnknownTest");
-    std::string name =
-        name_region + name_value_type + name_work_tag + name_test_id;
-    Kokkos::Profiling::pushRegion(name.c_str());
-    Kokkos::RangePolicy<DeviceType, AlgoTagType> policy(0, _a.extent(0));
-    Kokkos::parallel_for(name.c_str(), policy, *this);
-    Kokkos::Profiling::popRegion();
-    return 0;
-  }
-};
-
-template <typename DeviceType, typename ViewType, typename ScalarType,
-          int TestID>
-void impl_test_batched_matutil(const int N, const int BlkSize) {
-  /// typedefs
-  typedef typename ViewType::value_type value_type;
-  typedef Kokkos::Details::ArithTraits<value_type> ats;
-
-  /// radomized input testing views
-  const ScalarType alpha = 11.1;
-  ViewType a("a", N, BlkSize, BlkSize);
-  ViewType b("b", N, BlkSize, BlkSize);
-
-  Kokkos::Random_XorShift64_Pool<typename DeviceType::execution_space> random(
-      13718);
-  Kokkos::fill_random(a, random, value_type(1.0));
-
-  Kokkos::fence();
-
-  Kokkos::deep_copy(b, a);
-
-  /// test body
-  Functor_TestBatchedSerialMatUtil<DeviceType, ViewType, ScalarType, NaiveTag,
-                                   TestID>(alpha, a)
-      .run();
-  Functor_TestBatchedSerialMatUtil<DeviceType, ViewType, ScalarType,
-                                   KokkosKernelTag, TestID>(alpha, b)
-      .run();
-
-  Kokkos::fence();
-
-  /// for comparison send it to host
-  typename ViewType::HostMirror a_host = Kokkos::create_mirror_view(a);
-  typename ViewType::HostMirror b_host = Kokkos::create_mirror_view(b);
-
-  Kokkos::deep_copy(a_host, a);
-  Kokkos::deep_copy(b_host, b);
-
-  /// check a = b
-  typename ats::mag_type eps =
-      100 * std::numeric_limits<typename ats::mag_type>::epsilon();
-  for (int k = 0; k < N; ++k)
-    for (int i = 0; i < BlkSize; ++i)
-      for (int j = 0; j < BlkSize; ++j)
-        EXPECT_NEAR_KK(b_host(k, i, j), a_host(k, i, j), eps);
-}
-}  // namespace Test
-
-template <typename DeviceType, typename ValueType, typename ScalarType,
-          int TestID>
-int test_batched_matutil() {
-#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT)
-  {
-    typedef Kokkos::View<ValueType ***, Kokkos::LayoutLeft, DeviceType>
-        ViewType;
-    Test::impl_test_batched_matutil<DeviceType, ViewType, ScalarType, TestID>(
-        0, 10);
-    Test::impl_test_batched_matutil<DeviceType, ViewType, ScalarType, TestID>(
-        10, 15);
-    Test::impl_test_batched_matutil<DeviceType, ViewType, ScalarType, TestID>(
-        1024, 9);
-    Test::impl_test_batched_matutil<DeviceType, ViewType, ScalarType, TestID>(
-        132231, 3);
-  }
-#endif
-#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT)
-  {
-    typedef Kokkos::View<ValueType ***, Kokkos::LayoutRight, DeviceType>
-        ViewType;
-    Test::impl_test_batched_matutil<DeviceType, ViewType, ScalarType, TestID>(
-        0, 10);
-    Test::impl_test_batched_matutil<DeviceType, ViewType, ScalarType, TestID>(
-        10, 15);
-    Test::impl_test_batched_matutil<DeviceType, ViewType, ScalarType, TestID>(
-        1024, 9);
-    Test::impl_test_batched_matutil<DeviceType, ViewType, ScalarType, TestID>(
-        132231, 3);
-  }
-#endif
-
-  return 0;
-}
diff --git a/unit_test/batched/dense/Test_Batched_SerialMatUtil_Complex.hpp b/unit_test/batched/dense/Test_Batched_SerialMatUtil_Complex.hpp
deleted file mode 100644
index 055a0cae62..0000000000
--- a/unit_test/batched/dense/Test_Batched_SerialMatUtil_Complex.hpp
+++ /dev/null
@@ -1,19 +0,0 @@
-
-#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE)
-TEST_F(TestCategory, batched_scalar_serial_set_dcomplex_dcomplex) {
-  test_batched_matutil<TestExecSpace, Kokkos::complex<double>,
-                       Kokkos::complex<double>, ::Test::BatchedSet>();
-}
-TEST_F(TestCategory, batched_scalar_serial_scale_dcomplex_dcomplex) {
-  test_batched_matutil<TestExecSpace, Kokkos::complex<double>,
-                       Kokkos::complex<double>, ::Test::BatchedScale>();
-}
-TEST_F(TestCategory, batched_scalar_serial_set_dcomplex_double) {
-  test_batched_matutil<TestExecSpace, Kokkos::complex<double>, double,
-                       ::Test::BatchedSet>();
-}
-TEST_F(TestCategory, batched_scalar_serial_scale_dcomplex_double) {
-  test_batched_matutil<TestExecSpace, Kokkos::complex<double>, double,
-                       ::Test::BatchedScale>();
-}
-#endif
diff --git a/unit_test/batched/dense/Test_Batched_SerialMatUtil_Real.hpp b/unit_test/batched/dense/Test_Batched_SerialMatUtil_Real.hpp
deleted file mode 100644
index c1644f9798..0000000000
--- a/unit_test/batched/dense/Test_Batched_SerialMatUtil_Real.hpp
+++ /dev/null
@@ -1,18 +0,0 @@
-
-#if defined(KOKKOSKERNELS_INST_FLOAT)
-TEST_F(TestCategory, batched_scalar_serial_set_float_float) {
-  test_batched_matutil<TestExecSpace, float, float, ::Test::BatchedSet>();
-}
-TEST_F(TestCategory, batched_scalar_serial_scale_float_float) {
-  test_batched_matutil<TestExecSpace, float, float, ::Test::BatchedScale>();
-}
-#endif
-
-#if defined(KOKKOSKERNELS_INST_DOUBLE)
-TEST_F(TestCategory, batched_scalar_serial_set_double_double) {
-  test_batched_matutil<TestExecSpace, double, double, ::Test::BatchedSet>();
-}
-TEST_F(TestCategory, batched_scalar_serial_scale_double_double) {
-  test_batched_matutil<TestExecSpace, double, double, ::Test::BatchedScale>();
-}
-#endif
diff --git a/unit_test/batched/dense/Test_Batched_SerialSVD.hpp b/unit_test/batched/dense/Test_Batched_SerialSVD.hpp
index 57ec7f645b..d30da1726c 100644
--- a/unit_test/batched/dense/Test_Batched_SerialSVD.hpp
+++ b/unit_test/batched/dense/Test_Batched_SerialSVD.hpp
@@ -31,7 +31,7 @@ double simpleNorm2(const Vector& v) {
     double m = KAT::abs(vhost(i));
     d += m * m;
   }
-  return Kokkos::Experimental::sqrt(d);
+  return std::sqrt(d);
 }
 
 template <typename V1, typename V2>
diff --git a/unit_test/batched/dense/Test_Batched_TeamGemv_Complex.hpp b/unit_test/batched/dense/Test_Batched_TeamGemv_Complex.hpp
index cdcd00cff2..3ffc34db23 100644
--- a/unit_test/batched/dense/Test_Batched_TeamGemv_Complex.hpp
+++ b/unit_test/batched/dense/Test_Batched_TeamGemv_Complex.hpp
@@ -5,19 +5,21 @@
 TEST_F(TestCategory, batched_scalar_team_gemv_nt_dcomplex_dcomplex) {
   typedef ::Test::TeamGemv::ParamTag<Trans::NoTranspose> param_tag_type;
   typedef Algo::Gemv::Blocked algo_tag_type;
-  test_batched_gemv<TestExecSpace, Kokkos::complex<double>,
-                    Kokkos::complex<double>, param_tag_type, algo_tag_type>();
+  test_batched_team_gemv<TestExecSpace, Kokkos::complex<double>,
+                         Kokkos::complex<double>, param_tag_type,
+                         algo_tag_type>();
 }
 TEST_F(TestCategory, batched_scalar_team_gemv_t_dcomplex_dcomplex) {
   typedef ::Test::TeamGemv::ParamTag<Trans::Transpose> param_tag_type;
   typedef Algo::Gemv::Blocked algo_tag_type;
-  test_batched_gemv<TestExecSpace, Kokkos::complex<double>,
-                    Kokkos::complex<double>, param_tag_type, algo_tag_type>();
+  test_batched_team_gemv<TestExecSpace, Kokkos::complex<double>,
+                         Kokkos::complex<double>, param_tag_type,
+                         algo_tag_type>();
 }
 // TEST_F( TestCategory, batched_scalar_team_gemv_ct_dcomplex_dcomplex ) {
 //   typedef ::Test::TeamGemv::ParamTag<Trans::ConjTranspose> param_tag_type;
 //   typedef Algo::Gemv::Blocked algo_tag_type;
-//   test_batched_gemv<TestExecSpace,Kokkos::complex<double>,Kokkos::complex<double>,param_tag_type,algo_tag_type>();
+//   test_batched_team_gemv<TestExecSpace,Kokkos::complex<double>,Kokkos::complex<double>,param_tag_type,algo_tag_type>();
 // }
 
 /// dcomplex, double
@@ -25,19 +27,19 @@ TEST_F(TestCategory, batched_scalar_team_gemv_t_dcomplex_dcomplex) {
 TEST_F(TestCategory, batched_scalar_team_gemv_nt_dcomplex_double) {
   typedef ::Test::TeamGemv::ParamTag<Trans::NoTranspose> param_tag_type;
   typedef Algo::Gemv::Blocked algo_tag_type;
-  test_batched_gemv<TestExecSpace, Kokkos::complex<double>, double,
-                    param_tag_type, algo_tag_type>();
+  test_batched_team_gemv<TestExecSpace, Kokkos::complex<double>, double,
+                         param_tag_type, algo_tag_type>();
 }
 TEST_F(TestCategory, batched_scalar_team_gemv_t_dcomplex_double) {
   typedef ::Test::TeamGemv::ParamTag<Trans::Transpose> param_tag_type;
   typedef Algo::Gemv::Blocked algo_tag_type;
-  test_batched_gemv<TestExecSpace, Kokkos::complex<double>, double,
-                    param_tag_type, algo_tag_type>();
+  test_batched_team_gemv<TestExecSpace, Kokkos::complex<double>, double,
+                         param_tag_type, algo_tag_type>();
 }
 // TEST_F( TestCategory, batched_scalar_team_gemv_ct_dcomplex_double ) {
 //   typedef ::Test::TeamGemv::ParamTag<Trans::ConjTranspose> param_tag_type;
 //   typedef Algo::Gemv::Blocked algo_tag_type;
-//   test_batched_gemv<TestExecSpace,Kokkos::complex<double>,double,param_tag_type,algo_tag_type>();
+//   test_batched_team_gemv<TestExecSpace,Kokkos::complex<double>,double,param_tag_type,algo_tag_type>();
 // }
 
 #endif
diff --git a/unit_test/batched/dense/Test_Batched_TeamGemv_Real.hpp b/unit_test/batched/dense/Test_Batched_TeamGemv_Real.hpp
index 8401075f47..2c4db11b2d 100644
--- a/unit_test/batched/dense/Test_Batched_TeamGemv_Real.hpp
+++ b/unit_test/batched/dense/Test_Batched_TeamGemv_Real.hpp
@@ -3,14 +3,14 @@
 TEST_F(TestCategory, batched_scalar_team_gemv_nt_float_float) {
   typedef ::Test::TeamGemv::ParamTag<Trans::NoTranspose> param_tag_type;
   typedef Algo::Gemv::Blocked algo_tag_type;
-  test_batched_gemv<TestExecSpace, float, float, param_tag_type,
-                    algo_tag_type>();
+  test_batched_team_gemv<TestExecSpace, float, float, param_tag_type,
+                         algo_tag_type>();
 }
 TEST_F(TestCategory, batched_scalar_team_gemv_t_float_float) {
   typedef ::Test::TeamGemv::ParamTag<Trans::Transpose> param_tag_type;
   typedef Algo::Gemv::Blocked algo_tag_type;
-  test_batched_gemv<TestExecSpace, float, float, param_tag_type,
-                    algo_tag_type>();
+  test_batched_team_gemv<TestExecSpace, float, float, param_tag_type,
+                         algo_tag_type>();
 }
 #endif
 
@@ -18,13 +18,13 @@ TEST_F(TestCategory, batched_scalar_team_gemv_t_float_float) {
 TEST_F(TestCategory, batched_scalar_team_gemv_nt_double_double) {
   typedef ::Test::TeamGemv::ParamTag<Trans::NoTranspose> param_tag_type;
   typedef Algo::Gemv::Blocked algo_tag_type;
-  test_batched_gemv<TestExecSpace, double, double, param_tag_type,
-                    algo_tag_type>();
+  test_batched_team_gemv<TestExecSpace, double, double, param_tag_type,
+                         algo_tag_type>();
 }
 TEST_F(TestCategory, batched_scalar_team_gemv_t_double_double) {
   typedef ::Test::TeamGemv::ParamTag<Trans::Transpose> param_tag_type;
   typedef Algo::Gemv::Blocked algo_tag_type;
-  test_batched_gemv<TestExecSpace, double, double, param_tag_type,
-                    algo_tag_type>();
+  test_batched_team_gemv<TestExecSpace, double, double, param_tag_type,
+                         algo_tag_type>();
 }
 #endif
diff --git a/unit_test/batched/dense/Test_Batched_TeamGesv.hpp b/unit_test/batched/dense/Test_Batched_TeamGesv.hpp
new file mode 100644
index 0000000000..8f6bcf9f9d
--- /dev/null
+++ b/unit_test/batched/dense/Test_Batched_TeamGesv.hpp
@@ -0,0 +1,152 @@
+/// \author Kim Liegeois (knliege@sandia.gov)
+
+#include "gtest/gtest.h"
+#include "Kokkos_Core.hpp"
+#include "Kokkos_Random.hpp"
+
+#include "KokkosBatched_Gesv.hpp"
+#include "KokkosBatched_Dot.hpp"
+#include "KokkosBatched_Gemv_Decl.hpp"
+
+#include "KokkosKernels_TestUtils.hpp"
+
+#include "Test_Batched_DenseUtils.hpp"
+
+using namespace KokkosBatched;
+
+namespace Test {
+namespace TeamGesv {
+
+template <typename DeviceType, typename MatrixType, typename VectorType,
+          typename AlgoTagType>
+struct Functor_TestBatchedTeamGesv {
+  const MatrixType _A;
+  const VectorType _X;
+  const VectorType _B;
+
+  KOKKOS_INLINE_FUNCTION
+  Functor_TestBatchedTeamGesv(const MatrixType &A, const VectorType &X,
+                              const VectorType &B)
+      : _A(A), _X(X), _B(B) {}
+
+  template <typename MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const {
+    const int matrix_id = static_cast<int>(member.league_rank());
+    auto A = Kokkos::subview(_A, matrix_id, Kokkos::ALL, Kokkos::ALL);
+    auto x = Kokkos::subview(_X, matrix_id, Kokkos::ALL);
+    auto b = Kokkos::subview(_B, matrix_id, Kokkos::ALL);
+
+    member.team_barrier();
+    KokkosBatched::TeamGesv<MemberType, AlgoTagType>::invoke(member, A, x, b);
+    member.team_barrier();
+  }
+
+  inline void run() {
+    typedef typename VectorType::value_type value_type;
+    std::string name_region("KokkosBatched::Test::TeamGesv");
+    const std::string name_value_type = Test::value_type_name<value_type>();
+    std::string name                  = name_region + name_value_type;
+    Kokkos::Profiling::pushRegion(name.c_str());
+    Kokkos::TeamPolicy<DeviceType> policy(_X.extent(0), Kokkos::AUTO(),
+                                          Kokkos::AUTO());
+
+    using MatrixViewType =
+        Kokkos::View<typename MatrixType::non_const_value_type **,
+                     typename MatrixType::array_layout,
+                     typename MatrixType::execution_space>;
+
+    const int n    = _A.extent(1);
+    size_t bytes_0 = MatrixViewType::shmem_size(n, n + 4);
+    policy.set_scratch_size(0, Kokkos::PerTeam(bytes_0));
+
+    Kokkos::parallel_for(name.c_str(), policy, *this);
+    Kokkos::Profiling::popRegion();
+  }
+};
+
+template <typename DeviceType, typename MatrixType, typename VectorType,
+          typename AlgoTagType>
+void impl_test_batched_gesv(const int N, const int BlkSize) {
+  typedef typename MatrixType::value_type value_type;
+  typedef Kokkos::Details::ArithTraits<value_type> ats;
+
+  using MagnitudeType =
+      typename Kokkos::Details::ArithTraits<value_type>::mag_type;
+  using NormViewType =
+      Kokkos::View<MagnitudeType *, Kokkos::LayoutLeft, DeviceType>;
+
+  NormViewType sqr_norm_j("sqr_norm_j", N);
+  auto sqr_norm_j_host = Kokkos::create_mirror_view(sqr_norm_j);
+
+  MatrixType A("A", N, BlkSize, BlkSize), A2("A", N, BlkSize, BlkSize);
+  VectorType B("b", N, BlkSize), B2("b", N, BlkSize), X("x", N, BlkSize);
+
+  create_tridiagonal_batched_matrices(A, B);
+  Kokkos::deep_copy(A2, A);
+  Kokkos::deep_copy(B2, B);
+
+  auto A_host = Kokkos::create_mirror_view(A2);
+  auto B_host = Kokkos::create_mirror_view(B2);
+  auto X_host = Kokkos::create_mirror_view(X);
+
+  Kokkos::deep_copy(A_host, A2);
+  Kokkos::deep_copy(B_host, B2);
+
+  Kokkos::fence();
+
+  Functor_TestBatchedTeamGesv<DeviceType, MatrixType, VectorType, AlgoTagType>(
+      A, X, B)
+      .run();
+
+  Kokkos::fence();
+
+  Kokkos::deep_copy(X_host, X);
+
+  for (int l = 0; l < N; ++l)
+    KokkosBatched::SerialGemv<Trans::NoTranspose,
+                              KokkosBatched::Algo::Gemv::Unblocked>::
+        invoke(-1, Kokkos::subview(A_host, l, Kokkos::ALL, Kokkos::ALL),
+               Kokkos::subview(X_host, l, Kokkos::ALL), 1,
+               Kokkos::subview(B_host, l, Kokkos::ALL));
+
+  KokkosBatched::SerialDot<Trans::NoTranspose>::invoke(B_host, B_host,
+                                                       sqr_norm_j_host);
+
+  const MagnitudeType eps = 1.0e3 * ats::epsilon();
+
+  for (int l = 0; l < N; ++l) EXPECT_NEAR_KK(sqr_norm_j_host(l), 0, eps);
+}
+}  // namespace TeamGesv
+}  // namespace Test
+
+template <typename DeviceType, typename ValueType, typename AlgoTagType>
+int test_batched_team_gesv() {
+#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT)
+  {
+    typedef Kokkos::View<ValueType ***, Kokkos::LayoutLeft, DeviceType>
+        MatrixType;
+    typedef Kokkos::View<ValueType **, Kokkos::LayoutLeft, DeviceType>
+        VectorType;
+
+    for (int i = 3; i < 10; ++i) {
+      Test::TeamGesv::impl_test_batched_gesv<DeviceType, MatrixType, VectorType,
+                                             AlgoTagType>(1024, i);
+    }
+  }
+#endif
+#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT)
+  {
+    typedef Kokkos::View<ValueType ***, Kokkos::LayoutRight, DeviceType>
+        MatrixType;
+    typedef Kokkos::View<ValueType **, Kokkos::LayoutRight, DeviceType>
+        VectorType;
+
+    for (int i = 3; i < 10; ++i) {
+      Test::TeamGesv::impl_test_batched_gesv<DeviceType, MatrixType, VectorType,
+                                             AlgoTagType>(1024, i);
+    }
+  }
+#endif
+
+  return 0;
+}
diff --git a/unit_test/batched/dense/Test_Batched_TeamGesv_Real.hpp b/unit_test/batched/dense/Test_Batched_TeamGesv_Real.hpp
new file mode 100644
index 0000000000..8dca15a4a2
--- /dev/null
+++ b/unit_test/batched/dense/Test_Batched_TeamGesv_Real.hpp
@@ -0,0 +1,21 @@
+#if defined(KOKKOSKERNELS_INST_FLOAT)
+TEST_F(TestCategory, batched_scalar_team_gesv_static_pivoting_float) {
+  test_batched_team_gesv<TestExecSpace, float,
+                         KokkosBatched::Gesv::StaticPivoting>();
+}
+TEST_F(TestCategory, batched_scalar_team_gesv_no_pivoting_float) {
+  test_batched_team_gesv<TestExecSpace, float,
+                         KokkosBatched::Gesv::NoPivoting>();
+}
+#endif
+
+#if defined(KOKKOSKERNELS_INST_DOUBLE)
+TEST_F(TestCategory, batched_scalar_team_gesv_static_pivoting_double) {
+  test_batched_team_gesv<TestExecSpace, double,
+                         KokkosBatched::Gesv::StaticPivoting>();
+}
+TEST_F(TestCategory, batched_scalar_team_gesv_no_pivoting_double) {
+  test_batched_team_gesv<TestExecSpace, double,
+                         KokkosBatched::Gesv::NoPivoting>();
+}
+#endif
diff --git a/unit_test/batched/dense/Test_Batched_TeamMatUtil.hpp b/unit_test/batched/dense/Test_Batched_TeamMatUtil.hpp
deleted file mode 100644
index 16879444f7..0000000000
--- a/unit_test/batched/dense/Test_Batched_TeamMatUtil.hpp
+++ /dev/null
@@ -1,178 +0,0 @@
-/// \author Kyungjoo Kim (kyukim@sandia.gov)
-
-#include "gtest/gtest.h"
-#include "Kokkos_Core.hpp"
-#include "Kokkos_Random.hpp"
-
-#include "KokkosBatched_Set_Decl.hpp"
-#include "KokkosBatched_Set_Impl.hpp"
-
-#include "KokkosBatched_Scale_Decl.hpp"
-#include "KokkosBatched_Scale_Impl.hpp"
-
-#include "KokkosKernels_TestUtils.hpp"
-
-using namespace KokkosBatched;
-
-namespace Test {
-namespace TeamMatUtil {
-
-enum : int { BatchedSet = 0, BatchedScale = 1 };
-
-struct KokkosKernelTag {};
-struct NaiveTag {};
-
-template <typename DeviceType, typename ViewType, typename ScalarType,
-          typename AlgoTagType, int TestID>
-struct Functor_TestBatchedTeamMatUtil {
-  ScalarType _alpha;
-  ViewType _a;
-
-  KOKKOS_INLINE_FUNCTION
-  Functor_TestBatchedTeamMatUtil(const ScalarType alpha, const ViewType &a)
-      : _alpha(alpha), _a(a) {}
-
-  template <typename MemberType>
-  KOKKOS_INLINE_FUNCTION void operator()(const KokkosKernelTag &,
-                                         const MemberType &member) const {
-    const int i = member.league_rank();
-    auto A      = Kokkos::subview(_a, i, Kokkos::ALL(), Kokkos::ALL());
-    switch (TestID) {
-      case BatchedSet: TeamSet<MemberType>::invoke(member, _alpha, A); break;
-      case BatchedScale:
-        TeamScale<MemberType>::invoke(member, _alpha, A);
-        break;
-    }
-  }
-
-  template <typename MemberType>
-  KOKKOS_INLINE_FUNCTION void operator()(const NaiveTag &,
-                                         const MemberType &member) const {
-    if (member.team_rank() == 0) {
-      const int k = member.league_rank();
-      auto A      = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL());
-      const int m = A.extent(0), n = A.extent(1);
-      switch (TestID) {
-        case BatchedSet: {
-          for (int i = 0; i < m; ++i)
-            for (int j = 0; j < n; ++j) A(i, j) = _alpha;
-          break;
-        }
-        case BatchedScale: {
-          for (int i = 0; i < m; ++i)
-            for (int j = 0; j < n; ++j) A(i, j) *= _alpha;
-          break;
-        }
-      }
-    }
-  }
-
-  inline int run() {
-    typedef typename ViewType::value_type value_type;
-    std::string name_region("KokkosBatched::Test::SerialMatUtil");
-    const std::string name_value_type = Test::value_type_name<value_type>();
-    std::string name_work_tag =
-        (std::is_same<AlgoTagType, KokkosKernelTag>::value
-             ? "::KokkosBatched"
-             : std::is_same<AlgoTagType, NaiveTag>::value ? "::Naive"
-                                                          : "::UnknownWorkTag");
-    std::string name_test_id =
-        (TestID == BatchedSet
-             ? "Set"
-             : TestID == BatchedScale ? "Scale" : "UnknownTest");
-    std::string name =
-        name_region + name_value_type + name_work_tag + name_test_id;
-    Kokkos::Profiling::pushRegion(name.c_str());
-
-    const int league_size = _a.extent(0);
-    Kokkos::TeamPolicy<DeviceType, AlgoTagType> policy(league_size,
-                                                       Kokkos::AUTO);
-    Kokkos::parallel_for(name.c_str(), policy, *this);
-    Kokkos::Profiling::popRegion();
-
-    return 0;
-  }
-};
-
-template <typename DeviceType, typename ViewType, typename ScalarType,
-          int TestID>
-void impl_test_batched_matutil(const int N, const int BlkSize) {
-  /// typedefs
-  typedef typename ViewType::value_type value_type;
-  typedef Kokkos::Details::ArithTraits<value_type> ats;
-
-  /// radomized input testing views
-  const ScalarType alpha = 11.1;
-  ViewType a("a", N, BlkSize, BlkSize);
-  ViewType b("b", N, BlkSize, BlkSize);
-
-  Kokkos::Random_XorShift64_Pool<typename DeviceType::execution_space> random(
-      13718);
-  Kokkos::fill_random(a, random, value_type(1.0));
-
-  Kokkos::fence();
-
-  Kokkos::deep_copy(b, a);
-
-  /// test body
-  Functor_TestBatchedTeamMatUtil<DeviceType, ViewType, ScalarType, NaiveTag,
-                                 TestID>(alpha, a)
-      .run();
-  Functor_TestBatchedTeamMatUtil<DeviceType, ViewType, ScalarType,
-                                 KokkosKernelTag, TestID>(alpha, b)
-      .run();
-
-  Kokkos::fence();
-
-  /// for comparison send it to host
-  typename ViewType::HostMirror a_host = Kokkos::create_mirror_view(a);
-  typename ViewType::HostMirror b_host = Kokkos::create_mirror_view(b);
-
-  Kokkos::deep_copy(a_host, a);
-  Kokkos::deep_copy(b_host, b);
-
-  /// check a = b
-  typename ats::mag_type eps =
-      100 * std::numeric_limits<typename ats::mag_type>::epsilon();
-  for (int k = 0; k < N; ++k)
-    for (int i = 0; i < BlkSize; ++i)
-      for (int j = 0; j < BlkSize; ++j)
-        EXPECT_NEAR_KK(b_host(k, i, j), a_host(k, i, j), eps);
-}
-}  // namespace TeamMatUtil
-}  // namespace Test
-
-template <typename DeviceType, typename ValueType, typename ScalarType,
-          int TestID>
-int test_batched_team_matutil() {
-#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT)
-  {
-    typedef Kokkos::View<ValueType ***, Kokkos::LayoutLeft, DeviceType>
-        ViewType;
-    Test::TeamMatUtil::impl_test_batched_matutil<DeviceType, ViewType,
-                                                 ScalarType, TestID>(0, 10);
-    Test::TeamMatUtil::impl_test_batched_matutil<DeviceType, ViewType,
-                                                 ScalarType, TestID>(10, 15);
-    Test::TeamMatUtil::impl_test_batched_matutil<DeviceType, ViewType,
-                                                 ScalarType, TestID>(1024, 9);
-    Test::TeamMatUtil::impl_test_batched_matutil<DeviceType, ViewType,
-                                                 ScalarType, TestID>(132231, 3);
-  }
-#endif
-#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT)
-  {
-    typedef Kokkos::View<ValueType ***, Kokkos::LayoutRight, DeviceType>
-        ViewType;
-    Test::TeamMatUtil::impl_test_batched_matutil<DeviceType, ViewType,
-                                                 ScalarType, TestID>(0, 10);
-    Test::TeamMatUtil::impl_test_batched_matutil<DeviceType, ViewType,
-                                                 ScalarType, TestID>(10, 15);
-    Test::TeamMatUtil::impl_test_batched_matutil<DeviceType, ViewType,
-                                                 ScalarType, TestID>(1024, 9);
-    Test::TeamMatUtil::impl_test_batched_matutil<DeviceType, ViewType,
-                                                 ScalarType, TestID>(132231, 3);
-  }
-#endif
-
-  return 0;
-}
diff --git a/unit_test/batched/dense/Test_Batched_TeamMatUtil_Complex.hpp b/unit_test/batched/dense/Test_Batched_TeamMatUtil_Complex.hpp
deleted file mode 100644
index 7f573354d8..0000000000
--- a/unit_test/batched/dense/Test_Batched_TeamMatUtil_Complex.hpp
+++ /dev/null
@@ -1,19 +0,0 @@
-
-#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE)
-TEST_F(TestCategory, batched_scalar_team_set_dcomplex_dcomplex) {
-  test_batched_team_matutil<TestExecSpace, Kokkos::complex<double>,
-                            Kokkos::complex<double>, ::Test::BatchedSet>();
-}
-TEST_F(TestCategory, batched_scalar_team_scale_dcomplex_dcomplex) {
-  test_batched_team_matutil<TestExecSpace, Kokkos::complex<double>,
-                            Kokkos::complex<double>, ::Test::BatchedScale>();
-}
-TEST_F(TestCategory, batched_scalar_team_set_dcomplex_double) {
-  test_batched_team_matutil<TestExecSpace, Kokkos::complex<double>, double,
-                            ::Test::BatchedSet>();
-}
-TEST_F(TestCategory, batched_scalar_team_scale_dcomplex_double) {
-  test_batched_team_matutil<TestExecSpace, Kokkos::complex<double>, double,
-                            ::Test::BatchedScale>();
-}
-#endif
diff --git a/unit_test/batched/dense/Test_Batched_TeamMatUtil_Real.hpp b/unit_test/batched/dense/Test_Batched_TeamMatUtil_Real.hpp
deleted file mode 100644
index 1f13b79cca..0000000000
--- a/unit_test/batched/dense/Test_Batched_TeamMatUtil_Real.hpp
+++ /dev/null
@@ -1,21 +0,0 @@
-
-#if defined(KOKKOSKERNELS_INST_FLOAT)
-TEST_F(TestCategory, batched_scalar_team_set_float_float) {
-  test_batched_team_matutil<TestExecSpace, float, float, ::Test::BatchedSet>();
-}
-TEST_F(TestCategory, batched_scalar_team_scale_float_float) {
-  test_batched_team_matutil<TestExecSpace, float, float,
-                            ::Test::BatchedScale>();
-}
-#endif
-
-#if defined(KOKKOSKERNELS_INST_DOUBLE)
-TEST_F(TestCategory, batched_scalar_team_set_double_double) {
-  test_batched_team_matutil<TestExecSpace, double, double,
-                            ::Test::BatchedSet>();
-}
-TEST_F(TestCategory, batched_scalar_team_scale_double_double) {
-  test_batched_team_matutil<TestExecSpace, double, double,
-                            ::Test::BatchedScale>();
-}
-#endif
diff --git a/unit_test/batched/dense/Test_Batched_TeamVectorGesv.hpp b/unit_test/batched/dense/Test_Batched_TeamVectorGesv.hpp
new file mode 100644
index 0000000000..9ee05cb919
--- /dev/null
+++ b/unit_test/batched/dense/Test_Batched_TeamVectorGesv.hpp
@@ -0,0 +1,155 @@
+/// \author Kim Liegeois (knliege@sandia.gov)
+
+#include "gtest/gtest.h"
+#include "Kokkos_Core.hpp"
+#include "Kokkos_Random.hpp"
+
+#include "KokkosBatched_Gesv.hpp"
+#include "KokkosBatched_Dot.hpp"
+#include "KokkosBatched_Gemv_Decl.hpp"
+
+#include "KokkosKernels_TestUtils.hpp"
+
+#include "Test_Batched_DenseUtils.hpp"
+
+using namespace KokkosBatched;
+
+namespace Test {
+namespace TeamVectorGesv {
+
+template <typename DeviceType, typename MatrixType, typename VectorType,
+          typename AlgoTagType>
+struct Functor_TestBatchedTeamVectorGesv {
+  const MatrixType _A;
+  const VectorType _X;
+  const VectorType _B;
+
+  KOKKOS_INLINE_FUNCTION
+  Functor_TestBatchedTeamVectorGesv(const MatrixType &A, const VectorType &X,
+                                    const VectorType &B)
+      : _A(A), _X(X), _B(B) {}
+
+  template <typename MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const {
+    const int matrix_id = static_cast<int>(member.league_rank());
+    auto A = Kokkos::subview(_A, matrix_id, Kokkos::ALL, Kokkos::ALL);
+    auto x = Kokkos::subview(_X, matrix_id, Kokkos::ALL);
+    auto b = Kokkos::subview(_B, matrix_id, Kokkos::ALL);
+
+    member.team_barrier();
+    KokkosBatched::TeamVectorGesv<MemberType, AlgoTagType>::invoke(member, A, x,
+                                                                   b);
+    member.team_barrier();
+  }
+
+  inline void run() {
+    typedef typename VectorType::value_type value_type;
+    std::string name_region("KokkosBatched::Test::TeamVectorGesv");
+    const std::string name_value_type = Test::value_type_name<value_type>();
+    std::string name                  = name_region + name_value_type;
+    Kokkos::Profiling::pushRegion(name.c_str());
+    Kokkos::TeamPolicy<DeviceType> policy(_X.extent(0), Kokkos::AUTO(),
+                                          Kokkos::AUTO());
+
+    using MatrixViewType =
+        Kokkos::View<typename MatrixType::non_const_value_type **,
+                     typename MatrixType::array_layout,
+                     typename MatrixType::execution_space>;
+
+    const int n    = _A.extent(1);
+    size_t bytes_0 = MatrixViewType::shmem_size(n, n + 4);
+    policy.set_scratch_size(0, Kokkos::PerTeam(bytes_0));
+
+    Kokkos::parallel_for(name.c_str(), policy, *this);
+    Kokkos::Profiling::popRegion();
+  }
+};
+
+template <typename DeviceType, typename MatrixType, typename VectorType,
+          typename AlgoTagType>
+void impl_test_batched_gesv(const int N, const int BlkSize) {
+  typedef typename MatrixType::value_type value_type;
+  typedef Kokkos::Details::ArithTraits<value_type> ats;
+
+  using MagnitudeType =
+      typename Kokkos::Details::ArithTraits<value_type>::mag_type;
+  using NormViewType =
+      Kokkos::View<MagnitudeType *, Kokkos::LayoutLeft, DeviceType>;
+
+  NormViewType sqr_norm_j("sqr_norm_j", N);
+  auto sqr_norm_j_host = Kokkos::create_mirror_view(sqr_norm_j);
+
+  MatrixType A("A", N, BlkSize, BlkSize), A2("A", N, BlkSize, BlkSize);
+  VectorType B("b", N, BlkSize), B2("b", N, BlkSize), X("x", N, BlkSize);
+
+  create_tridiagonal_batched_matrices(A, B);
+  Kokkos::deep_copy(A2, A);
+  Kokkos::deep_copy(B2, B);
+
+  auto A_host = Kokkos::create_mirror_view(A2);
+  auto B_host = Kokkos::create_mirror_view(B2);
+  auto X_host = Kokkos::create_mirror_view(X);
+
+  Kokkos::deep_copy(A_host, A2);
+  Kokkos::deep_copy(B_host, B2);
+
+  Kokkos::fence();
+
+  Functor_TestBatchedTeamVectorGesv<DeviceType, MatrixType, VectorType,
+                                    AlgoTagType>(A, X, B)
+      .run();
+
+  Kokkos::fence();
+
+  Kokkos::deep_copy(X_host, X);
+
+  for (int l = 0; l < N; ++l)
+    KokkosBatched::SerialGemv<Trans::NoTranspose,
+                              KokkosBatched::Algo::Gemv::Unblocked>::
+        invoke(-1, Kokkos::subview(A_host, l, Kokkos::ALL, Kokkos::ALL),
+               Kokkos::subview(X_host, l, Kokkos::ALL), 1,
+               Kokkos::subview(B_host, l, Kokkos::ALL));
+
+  KokkosBatched::SerialDot<Trans::NoTranspose>::invoke(B_host, B_host,
+                                                       sqr_norm_j_host);
+
+  const MagnitudeType eps = 1.0e3 * ats::epsilon();
+
+  for (int l = 0; l < N; ++l) EXPECT_NEAR_KK(sqr_norm_j_host(l), 0, eps);
+}
+}  // namespace TeamVectorGesv
+}  // namespace Test
+
+template <typename DeviceType, typename ValueType, typename AlgoTagType>
+int test_batched_teamvector_gesv() {
+#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT)
+  {
+    typedef Kokkos::View<ValueType ***, Kokkos::LayoutLeft, DeviceType>
+        MatrixType;
+    typedef Kokkos::View<ValueType **, Kokkos::LayoutLeft, DeviceType>
+        VectorType;
+
+    for (int i = 3; i < 10; ++i) {
+      Test::TeamVectorGesv::impl_test_batched_gesv<DeviceType, MatrixType,
+                                                   VectorType, AlgoTagType>(
+          1024, i);
+    }
+  }
+#endif
+#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT)
+  {
+    typedef Kokkos::View<ValueType ***, Kokkos::LayoutRight, DeviceType>
+        MatrixType;
+    typedef Kokkos::View<ValueType **, Kokkos::LayoutRight, DeviceType>
+        VectorType;
+
+    for (int i = 3; i < 10; ++i) {
+      Test::TeamVectorGesv::impl_test_batched_gesv<DeviceType, MatrixType,
+                                                   VectorType, AlgoTagType>(
+          1024, i);
+    }
+  }
+#endif
+
+  return 0;
+}
diff --git a/unit_test/batched/dense/Test_Batched_TeamVectorGesv_Real.hpp b/unit_test/batched/dense/Test_Batched_TeamVectorGesv_Real.hpp
new file mode 100644
index 0000000000..d83706718c
--- /dev/null
+++ b/unit_test/batched/dense/Test_Batched_TeamVectorGesv_Real.hpp
@@ -0,0 +1,21 @@
+#if defined(KOKKOSKERNELS_INST_FLOAT)
+TEST_F(TestCategory, batched_scalar_teamvector_gesv_static_pivoting_float) {
+  test_batched_teamvector_gesv<TestExecSpace, float,
+                               KokkosBatched::Gesv::NoPivoting>();
+}
+TEST_F(TestCategory, batched_scalar_teamvector_gesv_no_pivoting_float) {
+  test_batched_teamvector_gesv<TestExecSpace, float,
+                               KokkosBatched::Gesv::StaticPivoting>();
+}
+#endif
+
+#if defined(KOKKOSKERNELS_INST_DOUBLE)
+TEST_F(TestCategory, batched_scalar_teamvector_gesv_static_pivoting_double) {
+  test_batched_teamvector_gesv<TestExecSpace, double,
+                               KokkosBatched::Gesv::StaticPivoting>();
+}
+TEST_F(TestCategory, batched_scalar_teamvector_gesv_no_pivoting_double) {
+  test_batched_teamvector_gesv<TestExecSpace, double,
+                               KokkosBatched::Gesv::NoPivoting>();
+}
+#endif
diff --git a/unit_test/batched/dense/Test_Batched_TeamVectorQR.hpp b/unit_test/batched/dense/Test_Batched_TeamVectorQR.hpp
index 4ae4ee4133..80bc7b246a 100644
--- a/unit_test/batched/dense/Test_Batched_TeamVectorQR.hpp
+++ b/unit_test/batched/dense/Test_Batched_TeamVectorQR.hpp
@@ -4,7 +4,7 @@
 #include "Kokkos_Core.hpp"
 #include "Kokkos_Random.hpp"
 
-#include "KokkosBatched_Set_Decl.hpp"
+#include "KokkosBlas1_set.hpp"
 #include "KokkosBatched_Copy_Decl.hpp"
 #include "KokkosBatched_Gemv_Decl.hpp"
 #include "KokkosBatched_Trsv_Decl.hpp"
@@ -49,7 +49,7 @@ struct Functor_TestBatchedTeamVectorQR {
                          [&](const int &i) { aa(i, i) += add_this; });
 
     /// xx = 1
-    TeamVectorSet<MemberType>::invoke(member, one, xx);
+    KokkosBlas::TeamVectorSet<MemberType>::invoke(member, one, xx);
     member.team_barrier();
 
     /// bb = AA*xx
diff --git a/unit_test/batched/dense/Test_Batched_TeamVectorQR_WithColumnPivoting.hpp b/unit_test/batched/dense/Test_Batched_TeamVectorQR_WithColumnPivoting.hpp
index 3ae24bda84..72754a5e00 100644
--- a/unit_test/batched/dense/Test_Batched_TeamVectorQR_WithColumnPivoting.hpp
+++ b/unit_test/batched/dense/Test_Batched_TeamVectorQR_WithColumnPivoting.hpp
@@ -4,7 +4,6 @@
 #include "Kokkos_Core.hpp"
 #include "Kokkos_Random.hpp"
 
-#include "KokkosBatched_Set_Decl.hpp"
 #include "KokkosBatched_Copy_Decl.hpp"
 #include "KokkosBatched_ApplyPivot_Decl.hpp"
 #include "KokkosBatched_Gemv_Decl.hpp"
diff --git a/unit_test/batched/sparse/Test_Batched_SerialGMRES.hpp b/unit_test/batched/sparse/Test_Batched_SerialGMRES.hpp
new file mode 100644
index 0000000000..108a984a9d
--- /dev/null
+++ b/unit_test/batched/sparse/Test_Batched_SerialGMRES.hpp
@@ -0,0 +1,239 @@
+/// \author Kim Liegeois (knliege@sandia.gov)
+
+#include "gtest/gtest.h"
+#include "Kokkos_Core.hpp"
+#include "Kokkos_Random.hpp"
+#include "KokkosBatched_GMRES.hpp"
+#include "KokkosKernels_TestUtils.hpp"
+#include "KokkosBatched_CrsMatrix.hpp"
+#include "Test_Batched_SparseUtils.hpp"
+#include "KokkosBatched_JacobiPrec.hpp"
+
+using namespace KokkosBatched;
+
+namespace Test {
+namespace GMRES {
+
+template <typename DeviceType, typename ValuesViewType, typename IntView,
+          typename VectorViewType, typename KrylovHandleType>
+struct Functor_TestBatchedSerialGMRES {
+  const ValuesViewType _D;
+  const IntView _r;
+  const IntView _c;
+  const VectorViewType _X;
+  const VectorViewType _B;
+  const VectorViewType _Diag;
+  const int _N_team;
+  KrylovHandleType _handle;
+
+  Functor_TestBatchedSerialGMRES(const ValuesViewType &D, const IntView &r,
+                                 const IntView &c, const VectorViewType &X,
+                                 const VectorViewType &B,
+                                 const VectorViewType &diag, const int N_team,
+                                 KrylovHandleType &handle)
+      : _D(D),
+        _r(r),
+        _c(c),
+        _X(X),
+        _B(B),
+        _Diag(diag),
+        _N_team(N_team),
+        _handle(handle) {}
+
+  KOKKOS_INLINE_FUNCTION void operator()(const int k) const {
+    const int first_matrix = _handle.first_index(k);
+    const int last_matrix  = _handle.last_index(k);
+
+    auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix),
+                             Kokkos::ALL);
+    auto diag = Kokkos::subview(
+        _Diag, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL);
+    auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix),
+                             Kokkos::ALL);
+    auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix),
+                             Kokkos::ALL);
+
+    using Operator     = KokkosBatched::CrsMatrix<ValuesViewType, IntView>;
+    using PrecOperator = KokkosBatched::JacobiPrec<ValuesViewType>;
+
+    Operator A(d, _r, _c);
+    PrecOperator P(diag);
+    P.setComputedInverse();
+
+    KokkosBatched::SerialGMRES::template invoke<Operator, VectorViewType>(
+        A, b, x, P, _handle, k);
+  }
+
+  inline void run() {
+    typedef typename ValuesViewType::value_type value_type;
+    std::string name_region("KokkosBatched::Test::SerialGMRES");
+    const std::string name_value_type = Test::value_type_name<value_type>();
+    std::string name                  = name_region + name_value_type;
+    Kokkos::Profiling::pushRegion(name.c_str());
+    Kokkos::RangePolicy<DeviceType> policy(0, _D.extent(0) / _N_team);
+
+    const int N                 = _D.extent(0);
+    const int n                 = _X.extent(1);
+    const int maximum_iteration = _handle.get_max_iteration();
+
+    _handle.set_ortho_strategy(0);
+    _handle.set_compute_last_residual(false);
+    _handle.set_tolerance(1e-8);
+
+    _handle.Arnoldi_view = typename KrylovHandleType::ArnoldiViewType(
+        "", N, maximum_iteration, n + maximum_iteration + 3);
+    _handle.tmp_view = typename KrylovHandleType::TemporaryViewType(
+        "", N, n + maximum_iteration + 3);
+
+    Kokkos::parallel_for(name.c_str(), policy, *this);
+    Kokkos::Profiling::popRegion();
+  }
+};
+
+template <typename DeviceType, typename ValuesViewType, typename IntView,
+          typename VectorViewType>
+void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) {
+  typedef typename ValuesViewType::value_type value_type;
+  typedef Kokkos::Details::ArithTraits<value_type> ats;
+
+  const int nnz = (BlkSize - 2) * 3 + 2 * 2;
+
+  VectorViewType X("x0", N, BlkSize);
+  VectorViewType R("r0", N, BlkSize);
+  VectorViewType B("b", N, BlkSize);
+  ValuesViewType D("D", N, nnz);
+  ValuesViewType Diag("Diag", N, BlkSize);
+  IntView r("r", BlkSize + 1);
+  IntView c("c", nnz);
+
+  using ScalarType = typename ValuesViewType::non_const_value_type;
+  using Layout     = typename ValuesViewType::array_layout;
+  using EXSP       = typename ValuesViewType::execution_space;
+
+  using MagnitudeType =
+      typename Kokkos::Details::ArithTraits<ScalarType>::mag_type;
+  using NormViewType = Kokkos::View<MagnitudeType *, Layout, EXSP>;
+
+  using Norm2DViewType   = Kokkos::View<MagnitudeType **, Layout, EXSP>;
+  using Scalar3DViewType = Kokkos::View<ScalarType ***, Layout, EXSP>;
+  using IntViewType      = Kokkos::View<int *, Layout, EXSP>;
+
+  using KrylovHandleType =
+      KrylovHandle<Norm2DViewType, IntViewType, Scalar3DViewType>;
+
+  NormViewType sqr_norm_0("sqr_norm_0", N);
+  NormViewType sqr_norm_j("sqr_norm_j", N);
+
+  create_tridiagonal_batched_matrices(nnz, BlkSize, N, r, c, D, X, B);
+
+  {
+    auto diag_values_host = Kokkos::create_mirror_view(Diag);
+    auto values_host      = Kokkos::create_mirror_view(D);
+    auto row_ptr_host     = Kokkos::create_mirror_view(r);
+    auto colIndices_host  = Kokkos::create_mirror_view(c);
+
+    Kokkos::deep_copy(values_host, D);
+    Kokkos::deep_copy(row_ptr_host, r);
+    Kokkos::deep_copy(colIndices_host, c);
+
+    int current_index;
+    for (int i = 0; i < BlkSize; ++i) {
+      for (current_index = row_ptr_host(i); current_index < row_ptr_host(i + 1);
+           ++current_index) {
+        if (colIndices_host(current_index) == i) break;
+      }
+      for (int j = 0; j < N; ++j)
+        diag_values_host(j, i) = values_host(j, current_index);
+    }
+
+    Kokkos::deep_copy(Diag, diag_values_host);
+  }
+
+  // Compute initial norm
+
+  Kokkos::deep_copy(R, B);
+
+  auto sqr_norm_0_host = Kokkos::create_mirror_view(sqr_norm_0);
+  auto sqr_norm_j_host = Kokkos::create_mirror_view(sqr_norm_j);
+  auto R_host          = Kokkos::create_mirror_view(R);
+  auto X_host          = Kokkos::create_mirror_view(X);
+  auto D_host          = Kokkos::create_mirror_view(D);
+  auto r_host          = Kokkos::create_mirror_view(r);
+  auto c_host          = Kokkos::create_mirror_view(c);
+
+  Kokkos::deep_copy(R, B);
+  Kokkos::deep_copy(R_host, R);
+  Kokkos::deep_copy(X_host, X);
+
+  Kokkos::deep_copy(c_host, c);
+  Kokkos::deep_copy(r_host, r);
+  Kokkos::deep_copy(D_host, D);
+
+  const int n_iterations = 10;
+  KrylovHandleType handle(N, N_team, n_iterations);
+
+  KokkosBatched::SerialSpmv<Trans::NoTranspose>::template invoke<
+      typename ValuesViewType::HostMirror, typename IntView::HostMirror,
+      typename VectorViewType::HostMirror, typename VectorViewType::HostMirror,
+      1>(-1, D_host, r_host, c_host, X_host, 1, R_host);
+  KokkosBatched::SerialDot<Trans::NoTranspose>::invoke(R_host, R_host,
+                                                       sqr_norm_0_host);
+  Functor_TestBatchedSerialGMRES<DeviceType, ValuesViewType, IntView,
+                                 VectorViewType, KrylovHandleType>(
+      D, r, c, X, B, Diag, N_team, handle)
+      .run();
+
+  Kokkos::fence();
+
+  Kokkos::deep_copy(R, B);
+  Kokkos::deep_copy(R_host, R);
+  Kokkos::deep_copy(X_host, X);
+
+  KokkosBatched::SerialSpmv<Trans::NoTranspose>::template invoke<
+      typename ValuesViewType::HostMirror, typename IntView::HostMirror,
+      typename VectorViewType::HostMirror, typename VectorViewType::HostMirror,
+      1>(-1, D_host, r_host, c_host, X_host, 1, R_host);
+  KokkosBatched::SerialDot<Trans::NoTranspose>::invoke(R_host, R_host,
+                                                       sqr_norm_j_host);
+
+  const MagnitudeType eps = 1.0e5 * ats::epsilon();
+
+  for (int l = 0; l < N; ++l)
+    EXPECT_NEAR_KK(
+        std::sqrt(sqr_norm_j_host(l)) / std::sqrt(sqr_norm_0_host(l)), 0, eps);
+}
+}  // namespace GMRES
+}  // namespace Test
+
+template <typename DeviceType, typename ValueType>
+int test_batched_serial_GMRES() {
+#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT)
+  {
+    typedef Kokkos::View<ValueType **, Kokkos::LayoutLeft, DeviceType> ViewType;
+    typedef Kokkos::View<int *, Kokkos::LayoutLeft, DeviceType> IntView;
+    typedef Kokkos::View<ValueType **, Kokkos::LayoutLeft, DeviceType>
+        VectorViewType;
+
+    for (int i = 3; i < 10; ++i) {
+      Test::GMRES::impl_test_batched_GMRES<DeviceType, ViewType, IntView,
+                                           VectorViewType>(1024, i, 2);
+    }
+  }
+#endif
+#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT)
+  {
+    typedef Kokkos::View<ValueType **, Kokkos::LayoutRight, DeviceType>
+        ViewType;
+    typedef Kokkos::View<int *, Kokkos::LayoutRight, DeviceType> IntView;
+    typedef Kokkos::View<ValueType **, Kokkos::LayoutRight, DeviceType>
+        VectorViewType;
+
+    for (int i = 3; i < 10; ++i) {
+      Test::GMRES::impl_test_batched_GMRES<DeviceType, ViewType, IntView,
+                                           VectorViewType>(1024, i, 2);
+    }
+  }
+#endif
+
+  return 0;
+}
diff --git a/unit_test/batched/sparse/Test_Batched_SerialGMRES_Real.hpp b/unit_test/batched/sparse/Test_Batched_SerialGMRES_Real.hpp
new file mode 100644
index 0000000000..acaa2f0ed2
--- /dev/null
+++ b/unit_test/batched/sparse/Test_Batched_SerialGMRES_Real.hpp
@@ -0,0 +1,12 @@
+
+#if defined(KOKKOSKERNELS_INST_FLOAT)
+TEST_F(TestCategory, batched_scalar_serial_GMRES_float) {
+  test_batched_serial_GMRES<TestExecSpace, float>();
+}
+#endif
+
+#if defined(KOKKOSKERNELS_INST_DOUBLE)
+TEST_F(TestCategory, batched_scalar_serial_GMRES_double) {
+  test_batched_serial_GMRES<TestExecSpace, double>();
+}
+#endif
diff --git a/unit_test/batched/sparse/Test_Batched_Sparse.hpp b/unit_test/batched/sparse/Test_Batched_Sparse.hpp
index 4b36400d2e..36bfc43528 100644
--- a/unit_test/batched/sparse/Test_Batched_Sparse.hpp
+++ b/unit_test/batched/sparse/Test_Batched_Sparse.hpp
@@ -2,6 +2,8 @@
 #define TEST_BATCHED_SPARSE_HPP
 
 // Serial kernels
+#include "Test_Batched_SerialGMRES.hpp"
+#include "Test_Batched_SerialGMRES_Real.hpp"
 #include "Test_Batched_SerialSpmv.hpp"
 #include "Test_Batched_SerialSpmv_Real.hpp"
 
diff --git a/unit_test/batched/sparse/Test_Batched_TeamCG.hpp b/unit_test/batched/sparse/Test_Batched_TeamCG.hpp
index 3e606d1508..8cfc76410b 100644
--- a/unit_test/batched/sparse/Test_Batched_TeamCG.hpp
+++ b/unit_test/batched/sparse/Test_Batched_TeamCG.hpp
@@ -14,7 +14,7 @@ namespace Test {
 namespace TeamCG {
 
 template <typename DeviceType, typename ValuesViewType, typename IntView,
-          typename VectorViewType>
+          typename VectorViewType, typename KrylovHandleType>
 struct Functor_TestBatchedTeamCG {
   const ValuesViewType _D;
   const IntView _r;
@@ -22,13 +22,18 @@ struct Functor_TestBatchedTeamCG {
   const VectorViewType _X;
   const VectorViewType _B;
   const int _N_team;
-  KrylovHandle<typename ValuesViewType::value_type> handle;
+  KrylovHandleType handle;
 
-  KOKKOS_INLINE_FUNCTION
   Functor_TestBatchedTeamCG(const ValuesViewType &D, const IntView &r,
                             const IntView &c, const VectorViewType &X,
                             const VectorViewType &B, const int N_team)
-      : _D(D), _r(r), _c(c), _X(X), _B(B), _N_team(N_team) {}
+      : _D(D),
+        _r(r),
+        _c(c),
+        _X(X),
+        _B(B),
+        _N_team(N_team),
+        handle(KrylovHandleType(_D.extent(0), _N_team)) {}
 
   template <typename MemberType>
   KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const {
@@ -50,9 +55,7 @@ struct Functor_TestBatchedTeamCG {
 
     Operator A(d, _r, _c);
 
-    KokkosBatched::TeamCG<MemberType>::template invoke<Operator,
-                                                       VectorViewType>(
-        member, A, b, x, handle);
+    KokkosBatched::TeamCG<MemberType>::invoke(member, A, b, x, handle);
   }
 
   inline void run() {
@@ -96,6 +99,13 @@ void impl_test_batched_CG(const int N, const int BlkSize, const int N_team) {
       typename Kokkos::Details::ArithTraits<ScalarType>::mag_type;
   using NormViewType = Kokkos::View<MagnitudeType *, Layout, EXSP>;
 
+  using Norm2DViewType   = Kokkos::View<MagnitudeType **, Layout, EXSP>;
+  using Scalar3DViewType = Kokkos::View<ScalarType ***, Layout, EXSP>;
+  using IntViewType      = Kokkos::View<int *, Layout, EXSP>;
+
+  using KrylovHandleType =
+      KrylovHandle<Norm2DViewType, IntViewType, Scalar3DViewType>;
+
   NormViewType sqr_norm_0("sqr_norm_0", N);
   NormViewType sqr_norm_j("sqr_norm_j", N);
 
@@ -127,8 +137,8 @@ void impl_test_batched_CG(const int N, const int BlkSize, const int N_team) {
       1>(-1, D_host, r_host, c_host, X_host, 1, R_host);
   KokkosBatched::SerialDot<Trans::NoTranspose>::invoke(R_host, R_host,
                                                        sqr_norm_0_host);
-  Functor_TestBatchedTeamCG<DeviceType, ValuesViewType, IntView,
-                            VectorViewType>(D, r, c, X, B, N_team)
+  Functor_TestBatchedTeamCG<DeviceType, ValuesViewType, IntView, VectorViewType,
+                            KrylovHandleType>(D, r, c, X, B, N_team)
       .run();
 
   Kokkos::fence();
diff --git a/unit_test/batched/sparse/Test_Batched_TeamGMRES.hpp b/unit_test/batched/sparse/Test_Batched_TeamGMRES.hpp
index f724553590..553d4d3419 100644
--- a/unit_test/batched/sparse/Test_Batched_TeamGMRES.hpp
+++ b/unit_test/batched/sparse/Test_Batched_TeamGMRES.hpp
@@ -15,21 +15,30 @@ namespace Test {
 namespace TeamGMRES {
 
 template <typename DeviceType, typename ValuesViewType, typename IntView,
-          typename VectorViewType>
+          typename VectorViewType, typename KrylovHandleType>
 struct Functor_TestBatchedTeamGMRES {
   const ValuesViewType _D;
   const IntView _r;
   const IntView _c;
   const VectorViewType _X;
   const VectorViewType _B;
+  const VectorViewType _Diag;
   const int _N_team;
-  KrylovHandle<typename ValuesViewType::value_type> handle;
+  KrylovHandleType _handle;
 
-  KOKKOS_INLINE_FUNCTION
   Functor_TestBatchedTeamGMRES(const ValuesViewType &D, const IntView &r,
                                const IntView &c, const VectorViewType &X,
-                               const VectorViewType &B, const int N_team)
-      : _D(D), _r(r), _c(c), _X(X), _B(B), _N_team(N_team) {}
+                               const VectorViewType &B,
+                               const VectorViewType &diag, const int N_team,
+                               KrylovHandleType &handle)
+      : _D(D),
+        _r(r),
+        _c(c),
+        _X(X),
+        _B(B),
+        _Diag(diag),
+        _N_team(N_team),
+        _handle(handle) {}
 
   template <typename MemberType>
   KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const {
@@ -42,18 +51,23 @@ struct Functor_TestBatchedTeamGMRES {
 
     auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix),
                              Kokkos::ALL);
+    auto diag = Kokkos::subview(
+        _Diag, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL);
     auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix),
                              Kokkos::ALL);
     auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix),
                              Kokkos::ALL);
 
-    using Operator = KokkosBatched::CrsMatrix<ValuesViewType, IntView>;
+    using Operator     = KokkosBatched::CrsMatrix<ValuesViewType, IntView>;
+    using PrecOperator = KokkosBatched::JacobiPrec<ValuesViewType>;
 
     Operator A(d, _r, _c);
+    PrecOperator P(diag);
+    P.setComputedInverse();
 
     KokkosBatched::TeamGMRES<MemberType>::template invoke<Operator,
                                                           VectorViewType>(
-        member, A, b, x, handle);
+        member, A, b, x, P, _handle);
   }
 
   inline void run() {
@@ -63,20 +77,37 @@ struct Functor_TestBatchedTeamGMRES {
     std::string name                  = name_region + name_value_type;
     Kokkos::Profiling::pushRegion(name.c_str());
     Kokkos::TeamPolicy<DeviceType> policy(_D.extent(0) / _N_team,
-                                          Kokkos::AUTO());
+                                          Kokkos::AUTO(), Kokkos::AUTO());
+
+    const int N                 = _D.extent(0);
+    const int n                 = _X.extent(1);
+    const int maximum_iteration = _handle.get_max_iteration();
+
+    _handle.set_ortho_strategy(0);
+    _handle.set_compute_last_residual(false);
+    _handle.set_tolerance(1e-8);
+
+    _handle.Arnoldi_view = typename KrylovHandleType::ArnoldiViewType(
+        "", N, maximum_iteration, n + maximum_iteration + 3);
 
-    size_t bytes_0 = ValuesViewType::shmem_size(_N_team, _X.extent(1));
-    size_t bytes_1 = ValuesViewType::shmem_size(_N_team, 1);
+    using ScalarType = typename ValuesViewType::non_const_value_type;
+    using Layout     = typename ValuesViewType::array_layout;
+    using EXSP       = typename ValuesViewType::execution_space;
 
-    handle.set_max_iteration(10);
+    using ViewType2D = Kokkos::View<ScalarType **, Layout, EXSP>;
 
-    int maximum_iteration = handle.get_max_iteration();
+    size_t bytes_1D   = ViewType2D::shmem_size(_N_team, 1);
+    size_t bytes_2D_1 = ViewType2D::shmem_size(_N_team, _X.extent(1));
+    size_t bytes_2D_2 = ViewType2D::shmem_size(_N_team, maximum_iteration + 1);
 
-    policy.set_scratch_size(0, Kokkos::PerTeam(5 * bytes_0 + 5 * bytes_1));
+    size_t bytes_row_ptr = IntView::shmem_size(_r.extent(0));
+    size_t bytes_col_idc = IntView::shmem_size(_c.extent(0));
+
+    size_t bytes_int  = bytes_row_ptr + bytes_col_idc;
+    size_t bytes_diag = bytes_2D_1;
+    size_t bytes_tmp  = 2 * bytes_2D_1 + 2 * bytes_1D + bytes_2D_2;
     policy.set_scratch_size(
-        1, Kokkos::PerTeam(maximum_iteration * bytes_0 +
-                           ((maximum_iteration + 3) * maximum_iteration) *
-                               bytes_1));
+        0, Kokkos::PerTeam(bytes_tmp + bytes_diag + bytes_int));
 
     Kokkos::parallel_for(name.c_str(), policy, *this);
     Kokkos::Profiling::popRegion();
@@ -95,6 +126,7 @@ void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) {
   VectorViewType R("r0", N, BlkSize);
   VectorViewType B("b", N, BlkSize);
   ValuesViewType D("D", N, nnz);
+  ValuesViewType Diag("Diag", N, BlkSize);
   IntView r("r", BlkSize + 1);
   IntView c("c", nnz);
 
@@ -106,11 +138,41 @@ void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) {
       typename Kokkos::Details::ArithTraits<ScalarType>::mag_type;
   using NormViewType = Kokkos::View<MagnitudeType *, Layout, EXSP>;
 
+  using Norm2DViewType   = Kokkos::View<MagnitudeType **, Layout, EXSP>;
+  using Scalar3DViewType = Kokkos::View<ScalarType ***, Layout, EXSP>;
+  using IntViewType      = Kokkos::View<int *, Layout, EXSP>;
+
+  using KrylovHandleType =
+      KrylovHandle<Norm2DViewType, IntViewType, Scalar3DViewType>;
+
   NormViewType sqr_norm_0("sqr_norm_0", N);
   NormViewType sqr_norm_j("sqr_norm_j", N);
 
   create_tridiagonal_batched_matrices(nnz, BlkSize, N, r, c, D, X, B);
 
+  {
+    auto diag_values_host = Kokkos::create_mirror_view(Diag);
+    auto values_host      = Kokkos::create_mirror_view(D);
+    auto row_ptr_host     = Kokkos::create_mirror_view(r);
+    auto colIndices_host  = Kokkos::create_mirror_view(c);
+
+    Kokkos::deep_copy(values_host, D);
+    Kokkos::deep_copy(row_ptr_host, r);
+    Kokkos::deep_copy(colIndices_host, c);
+
+    int current_index;
+    for (int i = 0; i < BlkSize; ++i) {
+      for (current_index = row_ptr_host(i); current_index < row_ptr_host(i + 1);
+           ++current_index) {
+        if (colIndices_host(current_index) == i) break;
+      }
+      for (int j = 0; j < N; ++j)
+        diag_values_host(j, i) = values_host(j, current_index);
+    }
+
+    Kokkos::deep_copy(Diag, diag_values_host);
+  }
+
   // Compute initial norm
 
   Kokkos::deep_copy(R, B);
@@ -131,6 +193,9 @@ void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) {
   Kokkos::deep_copy(r_host, r);
   Kokkos::deep_copy(D_host, D);
 
+  const int n_iterations = 10;
+  KrylovHandleType handle(N, N_team, n_iterations);
+
   KokkosBatched::SerialSpmv<Trans::NoTranspose>::template invoke<
       typename ValuesViewType::HostMirror, typename IntView::HostMirror,
       typename VectorViewType::HostMirror, typename VectorViewType::HostMirror,
@@ -138,7 +203,8 @@ void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) {
   KokkosBatched::SerialDot<Trans::NoTranspose>::invoke(R_host, R_host,
                                                        sqr_norm_0_host);
   Functor_TestBatchedTeamGMRES<DeviceType, ValuesViewType, IntView,
-                               VectorViewType>(D, r, c, X, B, N_team)
+                               VectorViewType, KrylovHandleType>(
+      D, r, c, X, B, Diag, N_team, handle)
       .run();
 
   Kokkos::fence();
diff --git a/unit_test/batched/sparse/Test_Batched_TeamVectorCG.hpp b/unit_test/batched/sparse/Test_Batched_TeamVectorCG.hpp
index 6637d9858d..d9fb350726 100644
--- a/unit_test/batched/sparse/Test_Batched_TeamVectorCG.hpp
+++ b/unit_test/batched/sparse/Test_Batched_TeamVectorCG.hpp
@@ -14,7 +14,7 @@ namespace Test {
 namespace TeamVectorCG {
 
 template <typename DeviceType, typename ValuesViewType, typename IntView,
-          typename VectorViewType>
+          typename VectorViewType, typename KrylovHandleType>
 struct Functor_TestBatchedTeamVectorCG {
   const ValuesViewType _D;
   const IntView _r;
@@ -22,13 +22,18 @@ struct Functor_TestBatchedTeamVectorCG {
   const VectorViewType _X;
   const VectorViewType _B;
   const int _N_team;
-  KrylovHandle<typename ValuesViewType::value_type> handle;
+  KrylovHandleType handle;
 
-  KOKKOS_INLINE_FUNCTION
   Functor_TestBatchedTeamVectorCG(const ValuesViewType &D, const IntView &r,
                                   const IntView &c, const VectorViewType &X,
                                   const VectorViewType &B, const int N_team)
-      : _D(D), _r(r), _c(c), _X(X), _B(B), _N_team(N_team) {}
+      : _D(D),
+        _r(r),
+        _c(c),
+        _X(X),
+        _B(B),
+        _N_team(N_team),
+        handle(KrylovHandleType(_D.extent(0), _N_team)) {}
 
   template <typename MemberType>
   KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const {
@@ -96,6 +101,13 @@ void impl_test_batched_CG(const int N, const int BlkSize, const int N_team) {
       typename Kokkos::Details::ArithTraits<ScalarType>::mag_type;
   using NormViewType = Kokkos::View<MagnitudeType *, Layout, EXSP>;
 
+  using Norm2DViewType   = Kokkos::View<MagnitudeType **, Layout, EXSP>;
+  using Scalar3DViewType = Kokkos::View<ScalarType ***, Layout, EXSP>;
+  using IntViewType      = Kokkos::View<int *, Layout, EXSP>;
+
+  using KrylovHandleType =
+      KrylovHandle<Norm2DViewType, IntViewType, Scalar3DViewType>;
+
   NormViewType sqr_norm_0("sqr_norm_0", N);
   NormViewType sqr_norm_j("sqr_norm_j", N);
 
@@ -128,7 +140,8 @@ void impl_test_batched_CG(const int N, const int BlkSize, const int N_team) {
   KokkosBatched::SerialDot<Trans::NoTranspose>::invoke(R_host, R_host,
                                                        sqr_norm_0_host);
   Functor_TestBatchedTeamVectorCG<DeviceType, ValuesViewType, IntView,
-                                  VectorViewType>(D, r, c, X, B, N_team)
+                                  VectorViewType, KrylovHandleType>(D, r, c, X,
+                                                                    B, N_team)
       .run();
 
   Kokkos::fence();
diff --git a/unit_test/batched/sparse/Test_Batched_TeamVectorGMRES.hpp b/unit_test/batched/sparse/Test_Batched_TeamVectorGMRES.hpp
index 87e9da0281..17f72c8963 100644
--- a/unit_test/batched/sparse/Test_Batched_TeamVectorGMRES.hpp
+++ b/unit_test/batched/sparse/Test_Batched_TeamVectorGMRES.hpp
@@ -15,7 +15,7 @@ namespace Test {
 namespace TeamVectorGMRES {
 
 template <typename DeviceType, typename ValuesViewType, typename IntView,
-          typename VectorViewType>
+          typename VectorViewType, typename KrylovHandleType>
 struct Functor_TestBatchedTeamVectorGMRES {
   const ValuesViewType _D;
   const IntView _r;
@@ -24,15 +24,21 @@ struct Functor_TestBatchedTeamVectorGMRES {
   const VectorViewType _B;
   const VectorViewType _Diag;
   const int _N_team;
-  KrylovHandle<typename ValuesViewType::value_type> handle;
+  KrylovHandleType _handle;
 
-  KOKKOS_INLINE_FUNCTION
   Functor_TestBatchedTeamVectorGMRES(const ValuesViewType &D, const IntView &r,
                                      const IntView &c, const VectorViewType &X,
                                      const VectorViewType &B,
                                      const VectorViewType &diag,
-                                     const int N_team)
-      : _D(D), _r(r), _c(c), _X(X), _B(B), _Diag(diag), _N_team(N_team) {}
+                                     const int N_team, KrylovHandleType &handle)
+      : _D(D),
+        _r(r),
+        _c(c),
+        _X(X),
+        _B(B),
+        _Diag(diag),
+        _N_team(N_team),
+        _handle(handle) {}
 
   template <typename MemberType>
   KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const {
@@ -57,10 +63,11 @@ struct Functor_TestBatchedTeamVectorGMRES {
 
     Operator A(d, _r, _c);
     PrecOperator P(diag);
+    P.setComputedInverse();
 
     KokkosBatched::TeamVectorGMRES<MemberType>::template invoke<Operator,
                                                                 VectorViewType>(
-        member, A, b, x, P, handle);
+        member, A, b, x, P, _handle);
   }
 
   inline void run() {
@@ -72,18 +79,35 @@ struct Functor_TestBatchedTeamVectorGMRES {
     Kokkos::TeamPolicy<DeviceType> policy(_D.extent(0) / _N_team,
                                           Kokkos::AUTO(), Kokkos::AUTO());
 
-    size_t bytes_0 = ValuesViewType::shmem_size(_N_team, _X.extent(1));
-    size_t bytes_1 = ValuesViewType::shmem_size(_N_team, 1);
+    const int N                 = _D.extent(0);
+    const int n                 = _X.extent(1);
+    const int maximum_iteration = _handle.get_max_iteration();
+
+    _handle.set_ortho_strategy(0);
+    _handle.set_compute_last_residual(false);
+    _handle.set_tolerance(1e-8);
+
+    _handle.Arnoldi_view = typename KrylovHandleType::ArnoldiViewType(
+        "", N, maximum_iteration, n + maximum_iteration + 3);
+
+    using ScalarType = typename ValuesViewType::non_const_value_type;
+    using Layout     = typename ValuesViewType::array_layout;
+    using EXSP       = typename ValuesViewType::execution_space;
 
-    handle.set_max_iteration(10);
+    using ViewType2D = Kokkos::View<ScalarType **, Layout, EXSP>;
 
-    int maximum_iteration = handle.get_max_iteration();
+    size_t bytes_1D   = ViewType2D::shmem_size(_N_team, 1);
+    size_t bytes_2D_1 = ViewType2D::shmem_size(_N_team, _X.extent(1));
+    size_t bytes_2D_2 = ViewType2D::shmem_size(_N_team, maximum_iteration + 1);
 
-    policy.set_scratch_size(0, Kokkos::PerTeam(5 * bytes_0 + 5 * bytes_1));
+    size_t bytes_row_ptr = IntView::shmem_size(_r.extent(0));
+    size_t bytes_col_idc = IntView::shmem_size(_c.extent(0));
+
+    size_t bytes_int  = bytes_row_ptr + bytes_col_idc;
+    size_t bytes_diag = bytes_2D_1;
+    size_t bytes_tmp  = 2 * bytes_2D_1 + 2 * bytes_1D + bytes_2D_2;
     policy.set_scratch_size(
-        1, Kokkos::PerTeam(maximum_iteration * bytes_0 +
-                           ((maximum_iteration + 3) * maximum_iteration) *
-                               bytes_1));
+        0, Kokkos::PerTeam(bytes_tmp + bytes_diag + bytes_int));
 
     Kokkos::parallel_for(name.c_str(), policy, *this);
     Kokkos::Profiling::popRegion();
@@ -114,6 +138,13 @@ void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) {
       typename Kokkos::Details::ArithTraits<ScalarType>::mag_type;
   using NormViewType = Kokkos::View<MagnitudeType *, Layout, EXSP>;
 
+  using Norm2DViewType   = Kokkos::View<MagnitudeType **, Layout, EXSP>;
+  using Scalar3DViewType = Kokkos::View<ScalarType ***, Layout, EXSP>;
+  using IntViewType      = Kokkos::View<int *, Layout, EXSP>;
+
+  using KrylovHandleType =
+      KrylovHandle<Norm2DViewType, IntViewType, Scalar3DViewType>;
+
   NormViewType sqr_norm_0("sqr_norm_0", N);
   NormViewType sqr_norm_j("sqr_norm_j", N);
 
@@ -162,6 +193,9 @@ void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) {
   Kokkos::deep_copy(r_host, r);
   Kokkos::deep_copy(D_host, D);
 
+  const int n_iterations = 10;
+  KrylovHandleType handle(N, N_team, n_iterations);
+
   KokkosBatched::SerialSpmv<Trans::NoTranspose>::template invoke<
       typename ValuesViewType::HostMirror, typename IntView::HostMirror,
       typename VectorViewType::HostMirror, typename VectorViewType::HostMirror,
@@ -169,8 +203,8 @@ void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) {
   KokkosBatched::SerialDot<Trans::NoTranspose>::invoke(R_host, R_host,
                                                        sqr_norm_0_host);
   Functor_TestBatchedTeamVectorGMRES<DeviceType, ValuesViewType, IntView,
-                                     VectorViewType>(D, r, c, X, B, Diag,
-                                                     N_team)
+                                     VectorViewType, KrylovHandleType>(
+      D, r, c, X, B, Diag, N_team, handle)
       .run();
 
   Kokkos::fence();
diff --git a/unit_test/blas/Test_Blas.hpp b/unit_test/blas/Test_Blas.hpp
index 16d54e3dce..c607e74ca8 100644
--- a/unit_test/blas/Test_Blas.hpp
+++ b/unit_test/blas/Test_Blas.hpp
@@ -23,7 +23,13 @@
 #include "Test_Blas1_sum.hpp"
 #include "Test_Blas1_update.hpp"
 
+// Serial Blas 1
+#include "Test_Blas1_serial_setscal.hpp"
+#include "Test_Blas_serial_axpy.hpp"
+#include "Test_Blas_serial_nrm2.hpp"
+
 // Team Blas 1
+#include "Test_Blas1_team_setscal.hpp"
 #include "Test_Blas1_team_abs.hpp"
 #include "Test_Blas1_team_axpby.hpp"
 #include "Test_Blas1_team_axpy.hpp"
@@ -44,6 +50,9 @@
 #include "Test_Blas3_trmm.hpp"
 #include "Test_Blas3_trsm.hpp"
 
+// Stuff that should move later on
+#include "Test_Blas_Newton.hpp"
+
 // TPLs
 #include "Test_Blas_rocblas.hpp"
 
diff --git a/unit_test/blas/Test_Blas1_dot.hpp b/unit_test/blas/Test_Blas1_dot.hpp
index b2e3f95628..83dfd6048c 100644
--- a/unit_test/blas/Test_Blas1_dot.hpp
+++ b/unit_test/blas/Test_Blas1_dot.hpp
@@ -111,6 +111,7 @@ void impl_test_dot_mv(int N, int K) {
   Kokkos::View<ScalarB*, Kokkos::HostSpace> r("Dot::Result", K);
 
   KokkosBlas::dot(r, a, b);
+  Kokkos::fence();
   for (int k = 0; k < K; k++) {
     ScalarA nonconst_nonconst_result = r(k);
     EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result[k],
@@ -118,6 +119,7 @@ void impl_test_dot_mv(int N, int K) {
   }
 
   KokkosBlas::dot(r, c_a, c_b);
+  Kokkos::fence();
   for (int k = 0; k < K; k++) {
     ScalarA const_const_result = r(k);
     EXPECT_NEAR_KK(const_const_result, expected_result[k],
@@ -125,6 +127,7 @@ void impl_test_dot_mv(int N, int K) {
   }
 
   KokkosBlas::dot(r, a, c_b);
+  Kokkos::fence();
   for (int k = 0; k < K; k++) {
     ScalarA non_const_const_result = r(k);
     EXPECT_NEAR_KK(non_const_const_result, expected_result[k],
@@ -132,6 +135,7 @@ void impl_test_dot_mv(int N, int K) {
   }
 
   KokkosBlas::dot(r, c_a, b);
+  Kokkos::fence();
   for (int k = 0; k < K; k++) {
     ScalarA const_non_const_result = r(k);
     EXPECT_NEAR_KK(const_non_const_result, expected_result[k],
diff --git a/unit_test/blas/Test_Blas1_iamax.hpp b/unit_test/blas/Test_Blas1_iamax.hpp
index 88c21be83c..82f1fc1c76 100644
--- a/unit_test/blas/Test_Blas1_iamax.hpp
+++ b/unit_test/blas/Test_Blas1_iamax.hpp
@@ -61,6 +61,7 @@ void impl_test_iamax(int N) {
     ViewType0D r("Iamax::Result 0-D View on host");
 
     KokkosBlas::iamax(r, a);
+    Kokkos::fence();
     size_type nonconst_max_loc = r();
     ASSERT_EQ(nonconst_max_loc, expected_max_loc);
 
@@ -151,6 +152,7 @@ void impl_test_iamax_mv(int N, int K) {
         r("Iamax::Result View on host", K);
 
     KokkosBlas::iamax(r, a);
+    Kokkos::fence();
 
     for (int k = 0; k < K; k++) {
       size_type nonconst_result = r(k);
@@ -159,6 +161,7 @@ void impl_test_iamax_mv(int N, int K) {
     }
 
     KokkosBlas::iamax(r, c_a);
+    Kokkos::fence();
 
     for (int k = 0; k < K; k++) {
       size_type const_result = r(k);
diff --git a/unit_test/blas/Test_Blas1_nrm1.hpp b/unit_test/blas/Test_Blas1_nrm1.hpp
index c68492b6dd..1c476cbf43 100644
--- a/unit_test/blas/Test_Blas1_nrm1.hpp
+++ b/unit_test/blas/Test_Blas1_nrm1.hpp
@@ -98,6 +98,7 @@ void impl_test_nrm1_mv(int N, int K) {
 
   KokkosBlas::nrm1(r, a);
   KokkosBlas::nrm1(c_r, a);
+  Kokkos::fence();
   for (int k = 0; k < K; k++) {
     EXPECT_NEAR_KK(r(k), expected_result(k), eps * expected_result(k));
     EXPECT_NEAR_KK(c_r(k), expected_result(k), eps * expected_result(k));
diff --git a/unit_test/blas/Test_Blas1_nrm2.hpp b/unit_test/blas/Test_Blas1_nrm2.hpp
index 688035f842..c568b12564 100644
--- a/unit_test/blas/Test_Blas1_nrm2.hpp
+++ b/unit_test/blas/Test_Blas1_nrm2.hpp
@@ -84,6 +84,7 @@ void impl_test_nrm2_mv(int N, int K) {
   Kokkos::View<typename AT::mag_type*, Kokkos::HostSpace> r("Dot::Result", K);
 
   KokkosBlas::nrm2(r, a);
+  Kokkos::fence();
   for (int k = 0; k < K; k++) {
     typename AT::mag_type nonconst_result = r(k);
     EXPECT_NEAR_KK(nonconst_result, expected_result[k],
@@ -91,6 +92,7 @@ void impl_test_nrm2_mv(int N, int K) {
   }
 
   KokkosBlas::nrm2(r, c_a);
+  Kokkos::fence();
   for (int k = 0; k < K; k++) {
     typename AT::mag_type const_result = r(k);
     EXPECT_NEAR_KK(const_result, expected_result[k], eps * expected_result[k]);
diff --git a/unit_test/blas/Test_Blas1_nrm2_squared.hpp b/unit_test/blas/Test_Blas1_nrm2_squared.hpp
index 317b9b543b..98c2cf7e8f 100644
--- a/unit_test/blas/Test_Blas1_nrm2_squared.hpp
+++ b/unit_test/blas/Test_Blas1_nrm2_squared.hpp
@@ -93,6 +93,7 @@ void impl_test_nrm2_squared_mv(int N, int K) {
   Kokkos::View<typename AT::mag_type*, Kokkos::HostSpace> r("Dot::Result", K);
 
   KokkosBlas::nrm2_squared(r, a);
+  Kokkos::fence();
   for (int k = 0; k < K; k++) {
     typename AT::mag_type nonconst_result = r(k);
     typename AT::mag_type divisor =
@@ -103,6 +104,7 @@ void impl_test_nrm2_squared_mv(int N, int K) {
   }
 
   KokkosBlas::nrm2_squared(r, c_a);
+  Kokkos::fence();
   for (int k = 0; k < K; k++) {
     typename AT::mag_type const_result = r(k);
     typename AT::mag_type divisor =
diff --git a/unit_test/blas/Test_Blas1_serial_setscal.hpp b/unit_test/blas/Test_Blas1_serial_setscal.hpp
new file mode 100644
index 0000000000..2e2a207c47
--- /dev/null
+++ b/unit_test/blas/Test_Blas1_serial_setscal.hpp
@@ -0,0 +1,246 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+#include "gtest/gtest.h"
+#include "Kokkos_Core.hpp"
+#include "Kokkos_Random.hpp"
+
+#include "KokkosBlas1_set.hpp"
+#include "KokkosBlas1_scal.hpp"
+
+#include "KokkosKernels_TestUtils.hpp"
+
+using namespace KokkosBlas;
+
+namespace Test {
+
+enum : int { BlasSet = 0, BlasScale = 1 };
+
+struct KokkosKernelTag {};
+struct NaiveTag {};
+
+template <typename DeviceType, typename ViewType, typename ScalarType,
+          typename AlgoTagType, int TestID>
+struct Functor_TestBlasSerialMatUtil {
+  ScalarType _alpha;
+  ViewType _a;
+
+  KOKKOS_INLINE_FUNCTION
+  Functor_TestBlasSerialMatUtil(const ScalarType alpha, const ViewType &a)
+      : _alpha(alpha), _a(a) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const KokkosKernelTag &, const int i) const {
+    auto A = Kokkos::subview(_a, i, Kokkos::ALL(), Kokkos::ALL());
+    switch (TestID) {
+      case BlasSet: KokkosBlas::SerialSet::invoke(_alpha, A); break;
+      case BlasScale: KokkosBlas::SerialScale::invoke(_alpha, A); break;
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const NaiveTag &, const int k) const {
+    // MD Note: changing because of the error with -werror
+    auto A      = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL());
+    const int m = A.extent(0), n = A.extent(1);
+    switch (TestID) {
+      case BlasSet: {
+        for (int i = 0; i < m; ++i)
+          for (int j = 0; j < n; ++j) A(i, j) = _alpha;
+        break;
+      }
+      case BlasScale: {
+        for (int i = 0; i < m; ++i)
+          for (int j = 0; j < n; ++j) A(i, j) *= _alpha;
+        break;
+      }
+    }
+  }
+
+  inline int run() {
+    typedef typename ViewType::value_type value_type;
+    std::string name_region("KokkosBlas::Test::SerialMatUtil");
+    const std::string name_value_type = Test::value_type_name<value_type>();
+    std::string name_work_tag =
+        (std::is_same<AlgoTagType, KokkosKernelTag>::value
+             ? "::KokkosBlas"
+             : std::is_same<AlgoTagType, NaiveTag>::value ? "::Naive"
+                                                          : "::UnknownWorkTag");
+    std::string name_test_id =
+        (TestID == BlasSet ? "Set"
+                           : TestID == BlasScale ? "Scale" : "UnknownTest");
+    std::string name =
+        name_region + name_value_type + name_work_tag + name_test_id;
+    Kokkos::Profiling::pushRegion(name.c_str());
+    Kokkos::RangePolicy<DeviceType, AlgoTagType> policy(0, _a.extent(0));
+    Kokkos::parallel_for(name.c_str(), policy, *this);
+    Kokkos::Profiling::popRegion();
+    return 0;
+  }
+};
+
+template <typename DeviceType, typename ViewType, typename ScalarType,
+          int TestID>
+void impl_test_blas_matutil(const int N, const int BlkSize) {
+  /// typedefs
+  typedef typename ViewType::value_type value_type;
+  typedef Kokkos::Details::ArithTraits<value_type> ats;
+
+  /// radomized input testing views
+  const ScalarType alpha = 11.1;
+  ViewType a("a", N, BlkSize, BlkSize);
+  ViewType b("b", N, BlkSize, BlkSize);
+
+  Kokkos::Random_XorShift64_Pool<typename DeviceType::execution_space> random(
+      13718);
+  Kokkos::fill_random(a, random, value_type(1.0));
+
+  Kokkos::fence();
+
+  Kokkos::deep_copy(b, a);
+
+  /// test body
+  Functor_TestBlasSerialMatUtil<DeviceType, ViewType, ScalarType, NaiveTag,
+                                TestID>(alpha, a)
+      .run();
+  Functor_TestBlasSerialMatUtil<DeviceType, ViewType, ScalarType,
+                                KokkosKernelTag, TestID>(alpha, b)
+      .run();
+
+  Kokkos::fence();
+
+  /// for comparison send it to host
+  typename ViewType::HostMirror a_host = Kokkos::create_mirror_view(a);
+  typename ViewType::HostMirror b_host = Kokkos::create_mirror_view(b);
+
+  Kokkos::deep_copy(a_host, a);
+  Kokkos::deep_copy(b_host, b);
+
+  /// check a = b
+  typename ats::mag_type eps =
+      100 * std::numeric_limits<typename ats::mag_type>::epsilon();
+  for (int k = 0; k < N; ++k)
+    for (int i = 0; i < BlkSize; ++i)
+      for (int j = 0; j < BlkSize; ++j)
+        EXPECT_NEAR_KK(b_host(k, i, j), a_host(k, i, j), eps);
+}
+}  // namespace Test
+
+template <typename DeviceType, typename ValueType, typename ScalarType,
+          int TestID>
+int test_blas_matutil() {
+#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT)
+  {
+    typedef Kokkos::View<ValueType ***, Kokkos::LayoutLeft, DeviceType>
+        ViewType;
+    Test::impl_test_blas_matutil<DeviceType, ViewType, ScalarType, TestID>(0,
+                                                                           10);
+    Test::impl_test_blas_matutil<DeviceType, ViewType, ScalarType, TestID>(10,
+                                                                           15);
+    Test::impl_test_blas_matutil<DeviceType, ViewType, ScalarType, TestID>(1024,
+                                                                           9);
+    Test::impl_test_blas_matutil<DeviceType, ViewType, ScalarType, TestID>(
+        132231, 3);
+  }
+#endif
+#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT)
+  {
+    typedef Kokkos::View<ValueType ***, Kokkos::LayoutRight, DeviceType>
+        ViewType;
+    Test::impl_test_blas_matutil<DeviceType, ViewType, ScalarType, TestID>(0,
+                                                                           10);
+    Test::impl_test_blas_matutil<DeviceType, ViewType, ScalarType, TestID>(10,
+                                                                           15);
+    Test::impl_test_blas_matutil<DeviceType, ViewType, ScalarType, TestID>(1024,
+                                                                           9);
+    Test::impl_test_blas_matutil<DeviceType, ViewType, ScalarType, TestID>(
+        132231, 3);
+  }
+#endif
+
+  return 0;
+}
+
+// Real test cases
+
+#if defined(KOKKOSKERNELS_INST_FLOAT)
+TEST_F(TestCategory, blas_scalar_serial_set_float_float) {
+  test_blas_matutil<TestExecSpace, float, float, ::Test::BlasSet>();
+}
+TEST_F(TestCategory, blas_scalar_serial_scale_float_float) {
+  test_blas_matutil<TestExecSpace, float, float, ::Test::BlasScale>();
+}
+#endif
+
+#if defined(KOKKOSKERNELS_INST_DOUBLE)
+TEST_F(TestCategory, blas_scalar_serial_set_double_double) {
+  test_blas_matutil<TestExecSpace, double, double, ::Test::BlasSet>();
+}
+TEST_F(TestCategory, blas_scalar_serial_scale_double_double) {
+  test_blas_matutil<TestExecSpace, double, double, ::Test::BlasScale>();
+}
+#endif
+
+// Complex test cases
+
+#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE)
+TEST_F(TestCategory, blas_scalar_serial_set_dcomplex_dcomplex) {
+  test_blas_matutil<TestExecSpace, Kokkos::complex<double>,
+                    Kokkos::complex<double>, ::Test::BlasSet>();
+}
+TEST_F(TestCategory, blas_scalar_serial_scale_dcomplex_dcomplex) {
+  test_blas_matutil<TestExecSpace, Kokkos::complex<double>,
+                    Kokkos::complex<double>, ::Test::BlasScale>();
+}
+TEST_F(TestCategory, blas_scalar_serial_set_dcomplex_double) {
+  test_blas_matutil<TestExecSpace, Kokkos::complex<double>, double,
+                    ::Test::BlasSet>();
+}
+TEST_F(TestCategory, blas_scalar_serial_scale_dcomplex_double) {
+  test_blas_matutil<TestExecSpace, Kokkos::complex<double>, double,
+                    ::Test::BlasScale>();
+}
+#endif
diff --git a/unit_test/blas/Test_Blas1_sum.hpp b/unit_test/blas/Test_Blas1_sum.hpp
index 2b7f51370e..5ad2ef038b 100644
--- a/unit_test/blas/Test_Blas1_sum.hpp
+++ b/unit_test/blas/Test_Blas1_sum.hpp
@@ -73,6 +73,7 @@ void impl_test_sum_mv(int N, int K) {
   Kokkos::View<ScalarA*, Kokkos::HostSpace> r("Sum::Result", K);
 
   KokkosBlas::sum(r, a);
+  Kokkos::fence();
   for (int k = 0; k < K; k++) {
     ScalarA nonconst_result = r(k);
     EXPECT_NEAR_KK(nonconst_result, expected_result[k],
@@ -80,6 +81,7 @@ void impl_test_sum_mv(int N, int K) {
   }
 
   KokkosBlas::sum(r, c_a);
+  Kokkos::fence();
   for (int k = 0; k < K; k++) {
     ScalarA const_result = r(k);
     EXPECT_NEAR_KK(const_result, expected_result[k], eps * expected_result[k]);
diff --git a/unit_test/blas/Test_Blas1_team_setscal.hpp b/unit_test/blas/Test_Blas1_team_setscal.hpp
new file mode 100644
index 0000000000..394c7b6c2d
--- /dev/null
+++ b/unit_test/blas/Test_Blas1_team_setscal.hpp
@@ -0,0 +1,259 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+#include "gtest/gtest.h"
+#include "Kokkos_Core.hpp"
+#include "Kokkos_Random.hpp"
+
+#include "KokkosBlas1_set.hpp"
+#include "KokkosBlas1_scal.hpp"
+
+#include "KokkosKernels_TestUtils.hpp"
+
+namespace Test {
+namespace TeamMatUtil {
+
+enum : int { BlasSet = 0, BlasScale = 1 };
+
+struct KokkosKernelTag {};
+struct NaiveTag {};
+
+template <typename DeviceType, typename ViewType, typename ScalarType,
+          typename AlgoTagType, int TestID>
+struct Functor_TestBlasTeamMatUtil {
+  ScalarType _alpha;
+  ViewType _a;
+
+  KOKKOS_INLINE_FUNCTION
+  Functor_TestBlasTeamMatUtil(const ScalarType alpha, const ViewType &a)
+      : _alpha(alpha), _a(a) {}
+
+  template <typename MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const KokkosKernelTag &,
+                                         const MemberType &member) const {
+    const int i = member.league_rank();
+    auto A      = Kokkos::subview(_a, i, Kokkos::ALL(), Kokkos::ALL());
+    switch (TestID) {
+      case BlasSet:
+        KokkosBlas::TeamSet<MemberType>::invoke(member, _alpha, A);
+        break;
+      case BlasScale:
+        KokkosBlas::TeamScale<MemberType>::invoke(member, _alpha, A);
+        break;
+    }
+  }
+
+  template <typename MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const NaiveTag &,
+                                         const MemberType &member) const {
+    if (member.team_rank() == 0) {
+      const int k = member.league_rank();
+      auto A      = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL());
+      const int m = A.extent(0), n = A.extent(1);
+      switch (TestID) {
+        case BlasSet: {
+          for (int i = 0; i < m; ++i)
+            for (int j = 0; j < n; ++j) A(i, j) = _alpha;
+          break;
+        }
+        case BlasScale: {
+          for (int i = 0; i < m; ++i)
+            for (int j = 0; j < n; ++j) A(i, j) *= _alpha;
+          break;
+        }
+      }
+    }
+  }
+
+  inline int run() {
+    typedef typename ViewType::value_type value_type;
+    std::string name_region("KokkosBlas::Test::SerialMatUtil");
+    const std::string name_value_type = Test::value_type_name<value_type>();
+    std::string name_work_tag =
+        (std::is_same<AlgoTagType, KokkosKernelTag>::value
+             ? "::KokkosBlas"
+             : std::is_same<AlgoTagType, NaiveTag>::value ? "::Naive"
+                                                          : "::UnknownWorkTag");
+    std::string name_test_id =
+        (TestID == BlasSet ? "Set"
+                           : TestID == BlasScale ? "Scale" : "UnknownTest");
+    std::string name =
+        name_region + name_value_type + name_work_tag + name_test_id;
+    Kokkos::Profiling::pushRegion(name.c_str());
+
+    const int league_size = _a.extent(0);
+    Kokkos::TeamPolicy<DeviceType, AlgoTagType> policy(league_size,
+                                                       Kokkos::AUTO);
+    Kokkos::parallel_for(name.c_str(), policy, *this);
+    Kokkos::Profiling::popRegion();
+
+    return 0;
+  }
+};
+
+template <typename DeviceType, typename ViewType, typename ScalarType,
+          int TestID>
+void impl_test_blas_matutil(const int N, const int BlkSize) {
+  /// typedefs
+  typedef typename ViewType::value_type value_type;
+  typedef Kokkos::Details::ArithTraits<value_type> ats;
+
+  /// radomized input testing views
+  const ScalarType alpha = 11.1;
+  ViewType a("a", N, BlkSize, BlkSize);
+  ViewType b("b", N, BlkSize, BlkSize);
+
+  Kokkos::Random_XorShift64_Pool<typename DeviceType::execution_space> random(
+      13718);
+  Kokkos::fill_random(a, random, value_type(1.0));
+
+  Kokkos::fence();
+
+  Kokkos::deep_copy(b, a);
+
+  /// test body
+  Functor_TestBlasTeamMatUtil<DeviceType, ViewType, ScalarType, NaiveTag,
+                              TestID>(alpha, a)
+      .run();
+  Functor_TestBlasTeamMatUtil<DeviceType, ViewType, ScalarType, KokkosKernelTag,
+                              TestID>(alpha, b)
+      .run();
+
+  Kokkos::fence();
+
+  /// for comparison send it to host
+  typename ViewType::HostMirror a_host = Kokkos::create_mirror_view(a);
+  typename ViewType::HostMirror b_host = Kokkos::create_mirror_view(b);
+
+  Kokkos::deep_copy(a_host, a);
+  Kokkos::deep_copy(b_host, b);
+
+  /// check a = b
+  typename ats::mag_type eps =
+      100 * std::numeric_limits<typename ats::mag_type>::epsilon();
+  for (int k = 0; k < N; ++k)
+    for (int i = 0; i < BlkSize; ++i)
+      for (int j = 0; j < BlkSize; ++j)
+        EXPECT_NEAR_KK(b_host(k, i, j), a_host(k, i, j), eps);
+}
+}  // namespace TeamMatUtil
+}  // namespace Test
+
+template <typename DeviceType, typename ValueType, typename ScalarType,
+          int TestID>
+int test_blas_team_matutil() {
+#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT)
+  {
+    typedef Kokkos::View<ValueType ***, Kokkos::LayoutLeft, DeviceType>
+        ViewType;
+    Test::TeamMatUtil::impl_test_blas_matutil<DeviceType, ViewType, ScalarType,
+                                              TestID>(0, 10);
+    Test::TeamMatUtil::impl_test_blas_matutil<DeviceType, ViewType, ScalarType,
+                                              TestID>(10, 15);
+    Test::TeamMatUtil::impl_test_blas_matutil<DeviceType, ViewType, ScalarType,
+                                              TestID>(1024, 9);
+    Test::TeamMatUtil::impl_test_blas_matutil<DeviceType, ViewType, ScalarType,
+                                              TestID>(132231, 3);
+  }
+#endif
+#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT)
+  {
+    typedef Kokkos::View<ValueType ***, Kokkos::LayoutRight, DeviceType>
+        ViewType;
+    Test::TeamMatUtil::impl_test_blas_matutil<DeviceType, ViewType, ScalarType,
+                                              TestID>(0, 10);
+    Test::TeamMatUtil::impl_test_blas_matutil<DeviceType, ViewType, ScalarType,
+                                              TestID>(10, 15);
+    Test::TeamMatUtil::impl_test_blas_matutil<DeviceType, ViewType, ScalarType,
+                                              TestID>(1024, 9);
+    Test::TeamMatUtil::impl_test_blas_matutil<DeviceType, ViewType, ScalarType,
+                                              TestID>(132231, 3);
+  }
+#endif
+
+  return 0;
+}
+
+// Real test cases
+
+#if defined(KOKKOSKERNELS_INST_FLOAT)
+TEST_F(TestCategory, blas_scalar_team_set_float_float) {
+  test_blas_team_matutil<TestExecSpace, float, float, ::Test::BlasSet>();
+}
+TEST_F(TestCategory, blas_scalar_team_scale_float_float) {
+  test_blas_team_matutil<TestExecSpace, float, float, ::Test::BlasScale>();
+}
+#endif
+
+#if defined(KOKKOSKERNELS_INST_DOUBLE)
+TEST_F(TestCategory, blas_scalar_team_set_double_double) {
+  test_blas_team_matutil<TestExecSpace, double, double, ::Test::BlasSet>();
+}
+TEST_F(TestCategory, blas_scalar_team_scale_double_double) {
+  test_blas_team_matutil<TestExecSpace, double, double, ::Test::BlasScale>();
+}
+#endif
+
+// Complex test cases
+
+#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE)
+TEST_F(TestCategory, blas_scalar_team_set_dcomplex_dcomplex) {
+  test_blas_team_matutil<TestExecSpace, Kokkos::complex<double>,
+                         Kokkos::complex<double>, ::Test::BlasSet>();
+}
+TEST_F(TestCategory, blas_scalar_team_scale_dcomplex_dcomplex) {
+  test_blas_team_matutil<TestExecSpace, Kokkos::complex<double>,
+                         Kokkos::complex<double>, ::Test::BlasScale>();
+}
+TEST_F(TestCategory, blas_scalar_team_set_dcomplex_double) {
+  test_blas_team_matutil<TestExecSpace, Kokkos::complex<double>, double,
+                         ::Test::BlasSet>();
+}
+TEST_F(TestCategory, blas_scalar_team_scale_dcomplex_double) {
+  test_blas_team_matutil<TestExecSpace, Kokkos::complex<double>, double,
+                         ::Test::BlasScale>();
+}
+#endif
diff --git a/unit_test/blas/Test_Blas_Newton.hpp b/unit_test/blas/Test_Blas_Newton.hpp
new file mode 100644
index 0000000000..600ba3e0b6
--- /dev/null
+++ b/unit_test/blas/Test_Blas_Newton.hpp
@@ -0,0 +1,187 @@
+#include <gtest/gtest.h>
+
+#include <KokkosBlas_Newton_impl.hpp>
+#include <KokkosKernels_TestUtils.hpp>
+
+namespace Test {
+
+// Logistic equation
+// dy/dt=y(1-y)
+//
+// solution y = 1/(1+exp(-t))
+// y(0)=0.5
+//
+// Using BDF1 to integrate:
+// y-y_n=dt*y*(1-y)
+//
+// Residual: r = y - y_n - dt*y*(1-y)
+// Jacobian: J = 1 - dt + 2*dt*y
+template <typename scalar_type, typename execution_space>
+struct LogisticEquation {
+  using vec_type = Kokkos::View<scalar_type*, execution_space>;
+  using mat_type = Kokkos::View<scalar_type**, execution_space>;
+
+  const int neqs = 1;
+  scalar_type dt;
+  vec_type state;
+
+  LogisticEquation(const scalar_type dt_, vec_type initial_state)
+      : dt(dt_), state(initial_state) {}
+
+  KOKKOS_FUNCTION void residual(const vec_type& y, const vec_type& dydt) const {
+    dydt(0) = y(0) - state(0) - dt * y(0) * (1 - y(0));
+  }
+
+  KOKKOS_FUNCTION void jacobian(const vec_type& y, const mat_type& jac) const {
+    jac(0, 0) = 1 - dt + 2 * dt * y(0);
+  }
+
+  KOKKOS_FUNCTION scalar_type expected_val(const scalar_type t) const {
+    using Kokkos::exp;
+
+    return static_cast<scalar_type>(1 / (1 + exp(-t)));
+  }
+
+  KOKKOS_FUNCTION int num_equations() const { return neqs; }
+};
+
+// Intersection of square and hyperbola
+// x^2 + y^2 = 20
+// x^2 - y^2 = -2
+//
+// solution: x = +/- 3
+//           y = +/- sqrt(11)
+//
+// Residual: r = [x^2 + y^2 - 20]
+//               [x^2 - y^2 +  2]
+// Jacobian: J = [2*x,  2*y]
+//               [2*x, -2*y]
+template <typename scalar_type, typename execution_space>
+struct Intersection {
+  using vec_type = Kokkos::View<scalar_type*, execution_space>;
+  using mat_type = Kokkos::View<scalar_type**, execution_space>;
+
+  const int neqs = 2;
+
+  Intersection() = default;
+
+  KOKKOS_FUNCTION void residual(const vec_type& y, const vec_type& dydt) const {
+    dydt(0) = y(0) * y(0) + y(1) * y(1) - 20;
+    dydt(1) = y(0) * y(0) - y(1) * y(1) + 2;
+  }
+
+  KOKKOS_FUNCTION void jacobian(const vec_type& y, const mat_type& jac) const {
+    jac(0, 0) = 2 * y(0);
+    jac(0, 1) = 2 * y(1);
+    jac(1, 0) = 2 * y(0);
+    jac(1, 1) = -2 * y(1);
+  }
+
+  KOKKOS_FUNCTION int num_equations() const { return neqs; }
+};
+
+template <class solver>
+struct NewtonWrapper {
+  solver newton_solver;
+
+  NewtonWrapper(solver newton_solver_) : newton_solver(newton_solver_){};
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int /* system_index */) const { newton_solver.solve(); }
+};
+
+template <typename execution_space, typename scalar_type>
+int test_logistic() {
+  using vec_type    = typename Kokkos::View<scalar_type*, execution_space>;
+  using mat_type    = typename Kokkos::View<scalar_type**, execution_space>;
+  using norm_type   = typename Kokkos::View<scalar_type*, execution_space>;
+  using handle_type = KokkosBlas::Impl::NewtonHandle<norm_type>;
+  using system_type = LogisticEquation<scalar_type, execution_space>;
+  using newton_type =
+      KokkosBlas::Impl::NewtonFunctor<system_type, mat_type, vec_type, vec_type,
+                                      handle_type>;
+
+  // Create the non-linear system and initialize data
+  vec_type state("state", 1);
+  Kokkos::deep_copy(state, 0.5);
+  system_type ode(0.1, state);
+
+  vec_type x("solution vector", 1), rhs("right hand side vector", 1);
+  Kokkos::deep_copy(x, 0.5);
+
+  // Create the solver and wrapper
+  handle_type handle;
+  handle.debug_mode = false;
+  newton_type newton_solver(ode, x, rhs, handle);
+  NewtonWrapper<newton_type> wrapper(newton_solver);
+
+  // Launch the problem in a parallel_for
+  Kokkos::RangePolicy<execution_space> my_policy(0, 1);
+  Kokkos::parallel_for(my_policy, wrapper);
+
+  // Get the solution back and test it
+  auto x_h = Kokkos::create_mirror_view(x);
+  Kokkos::deep_copy(x_h, x);
+  printf("Non-linear problem solution:\n");
+  printf("  [%f]\n", x_h(0));
+
+  return 0;
+}
+
+template <typename execution_space, typename scalar_type>
+int test_intersection() {
+  using vec_type    = typename Kokkos::View<scalar_type*, execution_space>;
+  using mat_type    = typename Kokkos::View<scalar_type**, execution_space>;
+  using norm_type   = typename Kokkos::View<scalar_type*, execution_space>;
+  using handle_type = KokkosBlas::Impl::NewtonHandle<norm_type>;
+  using system_type = Intersection<scalar_type, execution_space>;
+  using newton_type =
+      KokkosBlas::Impl::NewtonFunctor<system_type, mat_type, vec_type, vec_type,
+                                      handle_type>;
+
+  // Create the non-linear system and initialize data
+  system_type intersection;
+  vec_type x("solution vector", 2), rhs("right hand side vector", 2);
+  {
+    typename vec_type::HostMirror x_h = Kokkos::create_mirror_view(x);
+    x_h(0)                            = 2.5;
+    x_h(1)                            = 3.0;
+    Kokkos::deep_copy(x, x_h);
+  }
+
+  // Create the solver and wrapper
+  handle_type handle;
+  handle.debug_mode = false;
+  newton_type newton_solver(intersection, x, rhs, handle);
+  NewtonWrapper<newton_type> wrapper(newton_solver);
+
+  // Launch the problem in a parallel_for
+  Kokkos::RangePolicy<execution_space> my_policy(0, 1);
+  Kokkos::parallel_for(my_policy, wrapper);
+
+  // Get the solution back and test it
+  auto x_h = Kokkos::create_mirror_view(x);
+  Kokkos::deep_copy(x_h, x);
+  printf("Non-linear problem solution:\n");
+  for (int idx = 0; idx < x_h.extent_int(0); ++idx) {
+    printf("  [%f]\n", x_h(idx));
+  }
+  EXPECT_NEAR_KK(x_h(0), 3.0, 3.0e-4);
+  EXPECT_NEAR_KK(x_h(1), 3.3166247903553998, 3.3166247903553998 * 1.0e-4);
+
+  return 0;
+}
+
+}  // namespace Test
+
+template <class scalar_type>
+int test_newton() {
+  Test::test_logistic<TestExecSpace, scalar_type>();
+  Test::test_intersection<TestExecSpace, scalar_type>();
+
+  return 1;
+}
+
+#if defined(KOKKOSKERNELS_INST_DOUBLE)
+TEST_F(TestCategory, newton_serial) { test_newton<double>(); }
+#endif
diff --git a/unit_test/blas/Test_Blas_serial_axpy.hpp b/unit_test/blas/Test_Blas_serial_axpy.hpp
new file mode 100644
index 0000000000..83892640a7
--- /dev/null
+++ b/unit_test/blas/Test_Blas_serial_axpy.hpp
@@ -0,0 +1,218 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Luc Berger-Vergiat (lberge@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef TEST_BLAS_SERIAL_AXPY_HPP_
+#define TEST_BLAS_SERIAL_AXPY_HPP_
+
+#include "gtest/gtest.h"
+#include "Kokkos_Core.hpp"
+#include "Kokkos_Random.hpp"
+#include "KokkosKernels_TestUtils.hpp"
+
+#include "KokkosBlas1_axpby.hpp"
+
+namespace Test {
+
+struct KokkosKernelAxpyTag {};
+struct NaiveAxpyTag {};
+
+template <typename DeviceType, typename ViewType, typename ScalarType,
+          typename AlgoTagType>
+struct Functor_TestBlasSerialAxpy {
+  ScalarType _alpha;
+  ViewType _x;
+  ViewType _y;
+
+  KOKKOS_INLINE_FUNCTION
+  Functor_TestBlasSerialAxpy(const ScalarType alpha, const ViewType &x,
+                             const ViewType &y)
+      : _alpha(alpha), _x(x), _y(y) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const KokkosKernelAxpyTag &, const int i) const {
+    auto X = Kokkos::subview(_x, i, Kokkos::ALL(), Kokkos::ALL());
+    auto Y = Kokkos::subview(_y, i, Kokkos::ALL(), Kokkos::ALL());
+    KokkosBlas::serial_axpy(_alpha, X, Y);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const NaiveAxpyTag &, const int k) const {
+    auto X      = Kokkos::subview(_x, k, Kokkos::ALL(), Kokkos::ALL());
+    auto Y      = Kokkos::subview(_y, k, Kokkos::ALL(), Kokkos::ALL());
+    const int m = X.extent(0), n = X.extent(1);
+    for (int i = 0; i < m; ++i)
+      for (int j = 0; j < n; ++j) Y(i, j) += _alpha * X(i, j);
+  }
+
+  inline void run() {
+    using value_type = typename ViewType::value_type;
+    std::string name_region("KokkosBlas::Test::SerialAxpy");
+    const std::string name_value_type = Test::value_type_name<value_type>();
+    std::string name_work_tag =
+        (std::is_same<AlgoTagType, KokkosKernelAxpyTag>::value
+             ? "::KokkosBlas"
+             : std::is_same<AlgoTagType, NaiveAxpyTag>::value
+                   ? "::Naive"
+                   : "::UnknownWorkTag");
+    std::string name_test_id = "Axpy";
+    std::string name =
+        name_region + name_value_type + name_work_tag + name_test_id;
+    Kokkos::Profiling::pushRegion(name.c_str());
+    Kokkos::RangePolicy<DeviceType, AlgoTagType> policy(0, _x.extent(0));
+    Kokkos::parallel_for(name.c_str(), policy, *this);
+    Kokkos::Profiling::popRegion();
+    return;
+  }
+};
+
+template <typename DeviceType, typename ViewType, typename ScalarType>
+void impl_test_blas_serial_axpy(const int N, const int BlkSize) {
+  /// typedefs
+  using value_type = typename ViewType::value_type;
+  using ats        = Kokkos::ArithTraits<value_type>;
+
+  /// radomized input testing views
+  const ScalarType alpha = 11.1;
+  ViewType X("X", N, BlkSize, BlkSize);
+  ViewType Y("Y", N, BlkSize, BlkSize);
+  ViewType Yref("Yref", N, BlkSize, BlkSize);
+
+  Kokkos::Random_XorShift64_Pool<typename DeviceType::execution_space> random(
+      13718);
+  Kokkos::fill_random(X, random, ats::one());
+  Kokkos::fill_random(Y, random, ats::one());
+  Kokkos::fence();
+  Kokkos::deep_copy(Yref, Y);
+
+  /// test body
+  Functor_TestBlasSerialAxpy<DeviceType, ViewType, ScalarType, NaiveAxpyTag>(
+      alpha, X, Yref)
+      .run();
+  Functor_TestBlasSerialAxpy<DeviceType, ViewType, ScalarType,
+                             KokkosKernelAxpyTag>(alpha, X, Y)
+      .run();
+
+  Kokkos::fence();
+
+  /// for comparison send it to host
+  typename ViewType::HostMirror Y_host    = Kokkos::create_mirror_view(Y);
+  typename ViewType::HostMirror Yref_host = Kokkos::create_mirror_view(Yref);
+
+  Kokkos::deep_copy(Y_host, Y);
+  Kokkos::deep_copy(Yref_host, Yref);
+
+  /// check a = b
+  typename ats::mag_type eps =
+      100 * std::numeric_limits<typename ats::mag_type>::epsilon();
+  for (int k = 0; k < N; ++k)
+    for (int i = 0; i < BlkSize; ++i)
+      for (int j = 0; j < BlkSize; ++j)
+        EXPECT_NEAR_KK(Y_host(k, i, j), Yref_host(k, i, j), eps);
+}
+
+}  // namespace Test
+
+template <typename DeviceType, typename ValueType, typename ScalarType>
+int test_blas_serial_axpy() {
+#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT)
+  {
+    typedef Kokkos::View<ValueType ***, Kokkos::LayoutLeft, DeviceType>
+        ViewType;
+    Test::impl_test_blas_serial_axpy<DeviceType, ViewType, ScalarType>(0, 10);
+    Test::impl_test_blas_serial_axpy<DeviceType, ViewType, ScalarType>(10, 15);
+    Test::impl_test_blas_serial_axpy<DeviceType, ViewType, ScalarType>(1024, 9);
+    Test::impl_test_blas_serial_axpy<DeviceType, ViewType, ScalarType>(132231,
+                                                                       3);
+  }
+#endif
+#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT)
+  {
+    typedef Kokkos::View<ValueType ***, Kokkos::LayoutRight, DeviceType>
+        ViewType;
+    Test::impl_test_blas_serial_axpy<DeviceType, ViewType, ScalarType>(0, 10);
+    Test::impl_test_blas_serial_axpy<DeviceType, ViewType, ScalarType>(10, 15);
+    Test::impl_test_blas_serial_axpy<DeviceType, ViewType, ScalarType>(1024, 9);
+    Test::impl_test_blas_serial_axpy<DeviceType, ViewType, ScalarType>(132231,
+                                                                       3);
+  }
+#endif
+
+  return 0;
+}
+
+#if defined(KOKKOSKERNELS_INST_FLOAT)
+TEST_F(TestCategory, serial_axpy_float_float) {
+  test_blas_serial_axpy<TestExecSpace, float, float>();
+}
+#endif
+
+#if defined(KOKKOSKERNELS_INST_DOUBLE)
+TEST_F(TestCategory, serial_axpy_double_double) {
+  test_blas_serial_axpy<TestExecSpace, double, double>();
+}
+#endif
+
+#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE)
+TEST_F(TestCategory, serial_axpy_dcomplex_dcomplex) {
+  test_blas_serial_axpy<TestExecSpace, Kokkos::complex<double>,
+                        Kokkos::complex<double> >();
+}
+
+TEST_F(TestCategory, serial_axpy_dcomplex_double) {
+  test_blas_serial_axpy<TestExecSpace, Kokkos::complex<double>, double>();
+}
+#endif
+
+#if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT)
+TEST_F(TestCategory, serial_axpy_fcomplex_fcomplex) {
+  test_blas_serial_axpy<TestExecSpace, Kokkos::complex<float>,
+                        Kokkos::complex<double> >();
+}
+
+TEST_F(TestCategory, serial_axpy_fcomplex_float) {
+  test_blas_serial_axpy<TestExecSpace, Kokkos::complex<float>, float>();
+}
+#endif
+
+#endif  // TEST_BLAS_SERIAL_AXPY_HPP_
diff --git a/unit_test/blas/Test_Blas_serial_nrm2.hpp b/unit_test/blas/Test_Blas_serial_nrm2.hpp
new file mode 100644
index 0000000000..1a2721e782
--- /dev/null
+++ b/unit_test/blas/Test_Blas_serial_nrm2.hpp
@@ -0,0 +1,316 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Luc Berger-Vergiat (lberge@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef TEST_BLAS_SERIAL_NRM2_HPP_
+#define TEST_BLAS_SERIAL_NRM2_HPP_
+
+#include "gtest/gtest.h"
+#include "Kokkos_Core.hpp"
+#include "Kokkos_Random.hpp"
+#include "KokkosKernels_TestUtils.hpp"
+
+#include "KokkosBlas1_nrm2.hpp"
+
+namespace Test {
+
+template <typename DeviceType, typename ViewType, typename AlgoTagType>
+struct Functor_TestBlasSerialNrm2 {
+  using execution_space = typename DeviceType::execution_space;
+  using value_type      = typename ViewType::non_const_value_type;
+  using IPT             = Kokkos::Details::InnerProductSpaceTraits<value_type>;
+  using norm_type       = typename IPT::mag_type;
+  using norm_view_type  = Kokkos::View<norm_type *, execution_space>;
+
+  ViewType _x;
+  norm_view_type _nrm;
+
+  KOKKOS_INLINE_FUNCTION
+  Functor_TestBlasSerialNrm2(const ViewType &x, const norm_view_type &nrm)
+      : _x(x), _nrm(nrm) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const KokkosKernelTag &, const int i) const {
+    auto X  = Kokkos::subview(_x, i, Kokkos::ALL());
+    _nrm(i) = KokkosBlas::serial_nrm2(X);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const NaiveTag &, const int k) const {
+    auto X  = Kokkos::subview(_x, k, Kokkos::ALL());
+    _nrm(k) = Kokkos::ArithTraits<norm_type>::zero();
+    for (int i = 0; i < X.extent_int(0); ++i) {
+      _nrm(k) += IPT::norm(IPT::dot(X(i), X(i)));
+    }
+
+    _nrm(k) = Kokkos::ArithTraits<norm_type>::sqrt(_nrm(k));
+  }
+
+  inline void run() {
+    std::string name_region("KokkosBlas::Test::SerialNrm2");
+    const std::string name_value_type = Test::value_type_name<value_type>();
+    std::string name_work_tag =
+        (std::is_same<AlgoTagType, KokkosKernelTag>::value
+             ? "::KokkosBlas"
+             : std::is_same<AlgoTagType, NaiveTag>::value ? "::Naive"
+                                                          : "::UnknownWorkTag");
+    std::string name_test_id = "Nrm2";
+    std::string name =
+        name_region + name_value_type + name_work_tag + name_test_id;
+    Kokkos::Profiling::pushRegion(name.c_str());
+    Kokkos::RangePolicy<DeviceType, AlgoTagType> policy(0, _x.extent(0));
+    Kokkos::parallel_for(name.c_str(), policy, *this);
+    Kokkos::Profiling::popRegion();
+    return;
+  }
+};
+
+template <typename DeviceType, typename ViewType, typename AlgoTagType>
+struct Functor_TestBlasSerialNrm2MV {
+  using execution_space = typename DeviceType::execution_space;
+  using value_type      = typename ViewType::non_const_value_type;
+  using IPT             = Kokkos::Details::InnerProductSpaceTraits<value_type>;
+  using norm_type       = typename IPT::mag_type;
+  using norm_view_type  = Kokkos::View<norm_type **, execution_space>;
+
+  ViewType _x;
+  norm_view_type _nrm;
+
+  KOKKOS_INLINE_FUNCTION
+  Functor_TestBlasSerialNrm2MV(const ViewType &x, const norm_view_type &nrm)
+      : _x(x), _nrm(nrm) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const KokkosKernelTag &, const int i) const {
+    auto X = Kokkos::subview(_x, i, Kokkos::ALL(), Kokkos::ALL());
+    auto R = Kokkos::subview(_nrm, i, Kokkos::ALL());
+    KokkosBlas::serial_nrm2(X, R);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const NaiveTag &, const int k) const {
+    auto X = Kokkos::subview(_x, k, Kokkos::ALL(), Kokkos::ALL());
+    auto R = Kokkos::subview(_nrm, k, Kokkos::ALL());
+
+    for (int colIdx = 0; colIdx < X.extent_int(1); ++colIdx) {
+      R(colIdx) = Kokkos::ArithTraits<norm_type>::zero();
+      for (int rowIdx = 0; rowIdx < X.extent_int(0); ++rowIdx) {
+        R(colIdx) += IPT::norm(IPT::dot(X(rowIdx, colIdx), X(rowIdx, colIdx)));
+      }
+      R(colIdx) = Kokkos::ArithTraits<norm_type>::sqrt(R(colIdx));
+    }
+  }
+
+  inline void run() {
+    std::string name_region("KokkosBlas::Test::SerialNrm2MV");
+    const std::string name_value_type = Test::value_type_name<value_type>();
+    std::string name_work_tag =
+        (std::is_same<AlgoTagType, KokkosKernelTag>::value
+             ? "::KokkosBlas"
+             : std::is_same<AlgoTagType, NaiveTag>::value ? "::Naive"
+                                                          : "::UnknownWorkTag");
+    std::string name_test_id = "Nrm2";
+    std::string name =
+        name_region + name_value_type + name_work_tag + name_test_id;
+    Kokkos::Profiling::pushRegion(name.c_str());
+    Kokkos::RangePolicy<DeviceType, AlgoTagType> policy(0, _x.extent(0));
+    Kokkos::parallel_for(name.c_str(), policy, *this);
+    Kokkos::Profiling::popRegion();
+    return;
+  }
+};
+
+template <typename DeviceType, typename ViewType>
+void impl_test_blas_serial_nrm2(const int N, const int BlkSize) {
+  /// typedefs
+  using execution_space = typename DeviceType::execution_space;
+  using value_type      = typename ViewType::non_const_value_type;
+  using ats             = Kokkos::ArithTraits<value_type>;
+  using IPT             = Kokkos::Details::InnerProductSpaceTraits<value_type>;
+  using norm_type       = typename IPT::mag_type;
+  using norm_view_type  = Kokkos::View<norm_type *, execution_space>;
+
+  /// radomized input testing views
+  ViewType X("X", N, BlkSize);
+  Kokkos::Random_XorShift64_Pool<execution_space> random(13718);
+  Kokkos::fill_random(X, random, ats::one());
+  Kokkos::fence();
+
+  norm_view_type norms("norms", N);
+  norm_view_type norms_ref("ref norms", N);
+
+  /// test body
+  Functor_TestBlasSerialNrm2<DeviceType, ViewType, NaiveTag>(X, norms).run();
+  Functor_TestBlasSerialNrm2<DeviceType, ViewType, KokkosKernelTag>(X,
+                                                                    norms_ref)
+      .run();
+
+  Kokkos::fence();
+
+  /// for comparison send it to host
+  typename norm_view_type::HostMirror norms_host =
+      Kokkos::create_mirror_view(norms);
+  typename norm_view_type::HostMirror norms_ref_host =
+      Kokkos::create_mirror_view(norms_ref);
+
+  Kokkos::deep_copy(norms_host, norms);
+  Kokkos::deep_copy(norms_ref_host, norms_ref);
+
+  /// check a = b
+  typename ats::mag_type eps =
+      100 * std::numeric_limits<typename ats::mag_type>::epsilon();
+  for (int k = 0; k < N; ++k)
+    EXPECT_NEAR_KK(norms_host(k), norms_ref_host(k), eps);
+}
+
+template <typename DeviceType, typename ViewType>
+void impl_test_blas_serial_nrm2mv(const int N, const int vecLength,
+                                  const int numVecs) {
+  /// typedefs
+  using execution_space = typename DeviceType::execution_space;
+  using value_type      = typename ViewType::non_const_value_type;
+  using ats             = Kokkos::ArithTraits<value_type>;
+  using IPT             = Kokkos::Details::InnerProductSpaceTraits<value_type>;
+  using norm_type       = typename IPT::mag_type;
+  using norm_view_type  = Kokkos::View<norm_type **, execution_space>;
+
+  /// radomized input testing views
+  ViewType X("X", N, vecLength, numVecs);
+  Kokkos::Random_XorShift64_Pool<execution_space> random(13718);
+  Kokkos::fill_random(X, random, ats::one());
+  Kokkos::fence();
+
+  norm_view_type norms("norms", N, numVecs);
+  norm_view_type norms_ref("ref norms", N, numVecs);
+
+  /// test body
+  Functor_TestBlasSerialNrm2MV<DeviceType, ViewType, NaiveTag>(X, norms).run();
+  Functor_TestBlasSerialNrm2MV<DeviceType, ViewType, KokkosKernelTag>(X,
+                                                                      norms_ref)
+      .run();
+
+  Kokkos::fence();
+
+  /// for comparison send it to host
+  typename norm_view_type::HostMirror norms_host =
+      Kokkos::create_mirror_view(norms);
+  typename norm_view_type::HostMirror norms_ref_host =
+      Kokkos::create_mirror_view(norms_ref);
+
+  Kokkos::deep_copy(norms_host, norms);
+  Kokkos::deep_copy(norms_ref_host, norms_ref);
+
+  /// check a = b
+  typename ats::mag_type eps =
+      100 * std::numeric_limits<typename ats::mag_type>::epsilon();
+  for (int k = 0; k < N; ++k)
+    for (int vecIdx = 0; vecIdx < numVecs; ++vecIdx)
+      EXPECT_NEAR_KK(norms_host(k, vecIdx), norms_ref_host(k, vecIdx), eps);
+}
+
+}  // namespace Test
+
+template <typename DeviceType, typename ValueType>
+int test_blas_serial_nrm2() {
+#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT)
+  {
+    using ViewType = Kokkos::View<ValueType **, Kokkos::LayoutLeft, DeviceType>;
+    Test::impl_test_blas_serial_nrm2<DeviceType, ViewType>(0, 10);
+    Test::impl_test_blas_serial_nrm2<DeviceType, ViewType>(10, 15);
+    Test::impl_test_blas_serial_nrm2<DeviceType, ViewType>(1024, 9);
+    Test::impl_test_blas_serial_nrm2<DeviceType, ViewType>(132231, 3);
+
+    using MVViewType =
+        Kokkos::View<ValueType ***, Kokkos::LayoutLeft, DeviceType>;
+    Test::impl_test_blas_serial_nrm2mv<DeviceType, MVViewType>(0, 10, 5);
+    Test::impl_test_blas_serial_nrm2mv<DeviceType, MVViewType>(10, 15, 7);
+    Test::impl_test_blas_serial_nrm2mv<DeviceType, MVViewType>(1024, 9, 5);
+    Test::impl_test_blas_serial_nrm2mv<DeviceType, MVViewType>(132231, 3, 3);
+  }
+#endif
+#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT)
+  {
+    using ViewType =
+        Kokkos::View<ValueType **, Kokkos::LayoutRight, DeviceType>;
+    Test::impl_test_blas_serial_nrm2<DeviceType, ViewType>(0, 10);
+    Test::impl_test_blas_serial_nrm2<DeviceType, ViewType>(10, 15);
+    Test::impl_test_blas_serial_nrm2<DeviceType, ViewType>(1024, 9);
+    Test::impl_test_blas_serial_nrm2<DeviceType, ViewType>(132231, 3);
+
+    using MVViewType =
+        Kokkos::View<ValueType ***, Kokkos::LayoutLeft, DeviceType>;
+    Test::impl_test_blas_serial_nrm2mv<DeviceType, MVViewType>(0, 10, 5);
+    Test::impl_test_blas_serial_nrm2mv<DeviceType, MVViewType>(10, 15, 5);
+    Test::impl_test_blas_serial_nrm2mv<DeviceType, MVViewType>(1024, 9, 5);
+    Test::impl_test_blas_serial_nrm2mv<DeviceType, MVViewType>(132231, 3, 3);
+  }
+#endif
+
+  return 0;
+}
+
+#if defined(KOKKOSKERNELS_INST_FLOAT)
+TEST_F(TestCategory, serial_nrm2_float_float) {
+  test_blas_serial_nrm2<TestExecSpace, float>();
+}
+#endif
+
+#if defined(KOKKOSKERNELS_INST_DOUBLE)
+TEST_F(TestCategory, serial_nrm2_double_double) {
+  test_blas_serial_nrm2<TestExecSpace, double>();
+}
+#endif
+
+#if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT)
+TEST_F(TestCategory, serial_nrm2_fcomplex_float) {
+  test_blas_serial_nrm2<TestExecSpace, Kokkos::complex<float> >();
+}
+#endif
+
+#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE)
+TEST_F(TestCategory, serial_nrm2_dcomplex_dcomplex) {
+  test_blas_serial_nrm2<TestExecSpace, Kokkos::complex<double> >();
+}
+#endif
+
+#endif  // TEST_BLAS_SERIAL_NRM2_HPP_
diff --git a/unit_test/common/Test_Common.hpp b/unit_test/common/Test_Common.hpp
index 0a194071a8..20b875f4a5 100644
--- a/unit_test/common/Test_Common.hpp
+++ b/unit_test/common/Test_Common.hpp
@@ -1,15 +1,12 @@
 #ifndef TEST_COMMON_HPP
 #define TEST_COMMON_HPP
 
-// FIXME_SYCL still some uses of the wrong namespace
-#ifndef KOKKOS_ENABLE_SYCL
 #include <Test_Common_ArithTraits.hpp>
-#endif
 // #include<Test_Common_float128.hpp>
 #include <Test_Common_set_bit_count.hpp>
 #include <Test_Common_Sorting.hpp>
-#include <Test_Common_Transpose.hpp>
 #include <Test_Common_IOUtils.hpp>
 #include <Test_Common_Error.hpp>
+#include <Test_Common_Controls.hpp>
 
 #endif  // TEST_COMMON_HPP
diff --git a/unit_test/common/Test_Common_ArithTraits.hpp b/unit_test/common/Test_Common_ArithTraits.hpp
index 38a6ba7d78..19b0ce9d15 100644
--- a/unit_test/common/Test_Common_ArithTraits.hpp
+++ b/unit_test/common/Test_Common_ArithTraits.hpp
@@ -163,8 +163,8 @@ class ArithTraitsTesterBase {
   /// \brief Combine two intermediate reduction results into \c dst.
   ///
   /// Subclasses need not and must not override this method.
-  KOKKOS_INLINE_FUNCTION void join(volatile value_type& dst,
-                                   const volatile value_type& src) const {
+  KOKKOS_INLINE_FUNCTION void join(value_type& dst,
+                                   const value_type& src) const {
     dst = dst && src;
     // dst = 1;
   }
@@ -1722,6 +1722,10 @@ int runAllArithTraitsHostTests(std::ostream& out, const int verbose) {
   // testArithTraitsOnHost<Kokkos::complex<long double>, DeviceType> (out,
   // verbose);
 
+#if defined(KOKKOS_ENABLE_LIBQUADMATH)
+  success    = success && curSuccess;
+  curSuccess = testArithTraitsOnHost<__float128, DeviceType>(out, verbose);
+#endif
   return success && curSuccess;
 }
 
diff --git a/unit_test/common/Test_Common_Controls.hpp b/unit_test/common/Test_Common_Controls.hpp
new file mode 100644
index 0000000000..48c2a96715
--- /dev/null
+++ b/unit_test/common/Test_Common_Controls.hpp
@@ -0,0 +1,72 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef TEST_COMMON_CONTROLS_HPP
+#define TEST_COMMON_CONTROLS_HPP
+
+#include "KokkosKernels_Controls.hpp"
+
+void test_controls_empty() {
+  KokkosKernels::Experimental::Controls c;
+  EXPECT_EQ(c.isParameter(""), false);
+  EXPECT_EQ(c.getParameter(""), "");
+  EXPECT_EQ(c.getParameter("", "default"), "default");
+}
+
+void test_controls_set() {
+  KokkosKernels::Experimental::Controls c;
+  c.setParameter("key", "value");
+  EXPECT_EQ(c.isParameter("key"), true);
+  EXPECT_EQ(c.getParameter("key"), "value");
+  EXPECT_EQ(c.getParameter("key", "default"), "value");
+
+  EXPECT_EQ(c.isParameter(""), false);
+  EXPECT_EQ(c.getParameter(""), "");
+  EXPECT_EQ(c.getParameter("", "default"), "default");
+}
+
+TEST_F(TestCategory, controls_empty) { test_controls_empty(); }
+TEST_F(TestCategory, controls_set) { test_controls_set(); }
+
+#endif  // TEST_COMMON_CONTROLS_HPP
diff --git a/unit_test/common/Test_Common_Sorting.hpp b/unit_test/common/Test_Common_Sorting.hpp
index 1580a0c98b..f0320cb637 100644
--- a/unit_test/common/Test_Common_Sorting.hpp
+++ b/unit_test/common/Test_Common_Sorting.hpp
@@ -525,226 +525,6 @@ void testBitonicSortLexicographic() {
   ASSERT_TRUE(ordered);
 }
 
-template <typename exec_space>
-void testSortCRS(default_lno_t numRows, default_lno_t numCols,
-                 default_size_type nnz, bool doValues, bool doStructInterface) {
-  using scalar_t  = default_scalar;
-  using lno_t     = default_lno_t;
-  using size_type = default_size_type;
-  using mem_space = typename exec_space::memory_space;
-  using device_t  = Kokkos::Device<exec_space, mem_space>;
-  using crsMat_t =
-      KokkosSparse::CrsMatrix<scalar_t, lno_t, device_t, void, size_type>;
-  using rowmap_t  = typename crsMat_t::row_map_type;
-  using entries_t = typename crsMat_t::index_type;
-  using values_t  = typename crsMat_t::values_type;
-  // Create a random matrix on device
-  // IMPORTANT: kk_generate_sparse_matrix does not sort the rows, if it did this
-  // wouldn't test anything
-  crsMat_t A = KokkosKernels::Impl::kk_generate_sparse_matrix<crsMat_t>(
-      numRows, numCols, nnz, 2, numCols / 2);
-  auto rowmap  = A.graph.row_map;
-  auto entries = A.graph.entries;
-  auto values  = A.values;
-  Kokkos::View<size_type*, Kokkos::HostSpace> rowmapHost("rowmap host",
-                                                         numRows + 1);
-  Kokkos::View<lno_t*, Kokkos::HostSpace> entriesHost("sorted entries host",
-                                                      nnz);
-  Kokkos::View<scalar_t*, Kokkos::HostSpace> valuesHost("sorted values host",
-                                                        nnz);
-  Kokkos::deep_copy(rowmapHost, rowmap);
-  Kokkos::deep_copy(entriesHost, entries);
-  Kokkos::deep_copy(valuesHost, values);
-  struct ColValue {
-    ColValue() {}
-    ColValue(lno_t c, scalar_t v) : col(c), val(v) {}
-    bool operator<(const ColValue& rhs) const { return col < rhs.col; }
-    bool operator==(const ColValue& rhs) const {
-      return col == rhs.col && val == rhs.val;
-    }
-    lno_t col;
-    scalar_t val;
-  };
-  // sort one row at a time on host using STL.
-  {
-    for (lno_t i = 0; i < numRows; i++) {
-      std::vector<ColValue> rowCopy;
-      for (size_type j = rowmapHost(i); j < rowmapHost(i + 1); j++)
-        rowCopy.emplace_back(entriesHost(j), valuesHost(j));
-      std::sort(rowCopy.begin(), rowCopy.end());
-      // write sorted row back
-      for (size_t j = 0; j < rowCopy.size(); j++) {
-        entriesHost(rowmapHost(i) + j) = rowCopy[j].col;
-        valuesHost(rowmapHost(i) + j)  = rowCopy[j].val;
-      }
-    }
-  }
-  // call the actual sort routine being tested
-  if (doValues) {
-    if (doStructInterface) {
-      KokkosKernels::sort_crs_matrix(A);
-    } else {
-      KokkosKernels::sort_crs_matrix<exec_space, rowmap_t, entries_t, values_t>(
-          A.graph.row_map, A.graph.entries, A.values);
-    }
-  } else {
-    if (doStructInterface) {
-      KokkosKernels::sort_crs_graph(A.graph);
-    } else {
-      KokkosKernels::sort_crs_graph<exec_space, rowmap_t, entries_t>(
-          A.graph.row_map, A.graph.entries);
-    }
-  }
-  // Copy to host and compare
-  Kokkos::View<lno_t*, Kokkos::HostSpace> entriesOut("sorted entries host",
-                                                     nnz);
-  Kokkos::View<scalar_t*, Kokkos::HostSpace> valuesOut("sorted values host",
-                                                       nnz);
-  Kokkos::deep_copy(entriesOut, entries);
-  Kokkos::deep_copy(valuesOut, values);
-  for (size_type i = 0; i < nnz; i++) {
-    EXPECT_EQ(entriesHost(i), entriesOut(i))
-        << "Sorted column indices are wrong!";
-    if (doValues) {
-      EXPECT_EQ(valuesHost(i), valuesOut(i)) << "Sorted values are wrong!";
-    }
-  }
-}
-
-template <typename exec_space>
-void testSortCRSUnmanaged(bool doValues, bool doStructInterface) {
-  // This test is about bug #960.
-  using scalar_t  = default_scalar;
-  using lno_t     = default_lno_t;
-  using size_type = default_size_type;
-  using mem_space = typename exec_space::memory_space;
-  using device_t  = Kokkos::Device<exec_space, mem_space>;
-  using crsMat_t =
-      KokkosSparse::CrsMatrix<scalar_t, lno_t, device_t,
-                              Kokkos::MemoryTraits<Kokkos::Unmanaged>,
-                              size_type>;
-  using crsMat_Managed_t =
-      KokkosSparse::CrsMatrix<scalar_t, lno_t, device_t, void, size_type>;
-  using rowmap_t      = typename crsMat_t::row_map_type;
-  using entries_t     = typename crsMat_t::index_type;
-  using values_t      = typename crsMat_t::values_type;
-  const lno_t numRows = 50;
-  const lno_t numCols = numRows;
-  size_type nnz       = numRows * 5;
-  // Create a random matrix on device
-  // IMPORTANT: kk_generate_sparse_matrix does not sort the rows, if it did this
-  // wouldn't test anything
-  crsMat_Managed_t A_managed =
-      KokkosKernels::Impl::kk_generate_sparse_matrix<crsMat_Managed_t>(
-          numRows, numCols, nnz, 2, numCols / 2);
-  crsMat_t A(A_managed);
-  auto rowmap  = A.graph.row_map;
-  auto entries = A.graph.entries;
-  auto values  = A.values;
-  if (doValues) {
-    if (doStructInterface) {
-      KokkosKernels::sort_crs_matrix(A);
-    } else {
-      KokkosKernels::sort_crs_matrix<exec_space, rowmap_t, entries_t, values_t>(
-          A.graph.row_map, A.graph.entries, A.values);
-    }
-  } else {
-    if (doStructInterface) {
-      KokkosKernels::sort_crs_graph(A.graph);
-    } else {
-      KokkosKernels::sort_crs_graph<exec_space, rowmap_t, entries_t>(
-          A.graph.row_map, A.graph.entries);
-    }
-  }
-}
-
-template <typename exec_space>
-void testSortAndMerge() {
-  using size_type = default_size_type;
-  using lno_t     = default_lno_t;
-  using scalar_t  = default_scalar;
-  using mem_space = typename exec_space::memory_space;
-  using device_t  = Kokkos::Device<exec_space, mem_space>;
-  using crsMat_t =
-      KokkosSparse::CrsMatrix<scalar_t, lno_t, device_t, void, size_type>;
-  using rowmap_t  = typename crsMat_t::row_map_type::non_const_type;
-  using entries_t = typename crsMat_t::index_type;
-  using values_t  = typename crsMat_t::values_type;
-  using Kokkos::HostSpace;
-  using Kokkos::MemoryTraits;
-  using Kokkos::Unmanaged;
-  // Create a small CRS matrix on host
-  std::vector<size_type> inRowmap = {0, 4, 4, 5, 7, 10};
-  std::vector<lno_t> inEntries    = {
-      4, 3, 5, 3,  // row 0
-                   // row 1 has no entries
-      6,           // row 2
-      2, 2,        // row 3
-      0, 1, 2      // row 4
-  };
-  // note: choosing values that can be represented exactly by float
-  std::vector<scalar_t> inValues = {
-      1.5, 4, 1, -3,  // row 0
-                      // row 1
-      2,              // row 2
-      -1, -2,         // row 3
-      0, 3.5, -2.25   // row 4
-  };
-  lno_t nrows   = 5;
-  lno_t ncols   = 7;
-  size_type nnz = inEntries.size();
-  Kokkos::View<size_type*, HostSpace, MemoryTraits<Unmanaged>> hostInRowmap(
-      inRowmap.data(), nrows + 1);
-  Kokkos::View<lno_t*, HostSpace, MemoryTraits<Unmanaged>> hostInEntries(
-      inEntries.data(), nnz);
-  Kokkos::View<scalar_t*, HostSpace, MemoryTraits<Unmanaged>> hostInValues(
-      inValues.data(), nnz);
-  rowmap_t devInRowmap("", nrows + 1);
-  entries_t devInEntries("", nnz);
-  values_t devInValues("", nnz);
-  Kokkos::deep_copy(devInRowmap, hostInRowmap);
-  Kokkos::deep_copy(devInEntries, hostInEntries);
-  Kokkos::deep_copy(devInValues, hostInValues);
-  crsMat_t input("Input", nrows, ncols, nnz, devInValues, devInRowmap,
-                 devInEntries);
-  crsMat_t output = KokkosKernels::sort_and_merge_matrix(input);
-  exec_space().fence();
-  EXPECT_EQ(output.numRows(), nrows);
-  EXPECT_EQ(output.numCols(), ncols);
-  auto outRowmap  = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
-                                                       output.graph.row_map);
-  auto outEntries = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
-                                                        output.graph.entries);
-  auto outValues =
-      Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), output.values);
-  // Expect 2 merges to have taken place
-  std::vector<size_type> goldRowmap = {0, 3, 3, 4, 5, 8};
-  std::vector<lno_t> goldEntries    = {
-      3, 4, 5,  // row 0
-                // row 1 has no entries
-      6,        // row 2
-      2,        // row 3
-      0, 1, 2   // row 4
-  };
-  // note: choosing values that can be represented exactly by float
-  std::vector<scalar_t> goldValues = {
-      1, 1.5, 1,     // row 0
-                     // row 1
-      2,             // row 2
-      -3,            // row 3
-      0, 3.5, -2.25  // row 4
-  };
-  EXPECT_EQ(goldRowmap.size(), outRowmap.extent(0));
-  EXPECT_EQ(goldEntries.size(), outEntries.extent(0));
-  EXPECT_EQ(goldValues.size(), outValues.extent(0));
-  EXPECT_EQ(goldValues.size(), output.nnz());
-  for (lno_t i = 0; i < nrows + 1; i++) EXPECT_EQ(goldRowmap[i], outRowmap(i));
-  for (size_type i = 0; i < output.nnz(); i++) {
-    EXPECT_EQ(goldEntries[i], outEntries(i));
-    EXPECT_EQ(goldValues[i], outValues(i));
-  }
-}
-
 TEST_F(TestCategory, common_serial_radix) {
   // Test serial radix over some contiguous small arrays
   // 1st arg is #arrays, 2nd arg is max subarray size
@@ -805,31 +585,4 @@ TEST_F(TestCategory, common_device_bitonic) {
   testBitonicSortLexicographic<TestExecSpace>();
 }
 
-TEST_F(TestCategory, common_sort_crsgraph) {
-  for (int doStructInterface = 0; doStructInterface < 2; doStructInterface++) {
-    testSortCRS<TestExecSpace>(10, 10, 20, false, doStructInterface);
-    testSortCRS<TestExecSpace>(100, 100, 2000, false, doStructInterface);
-    testSortCRS<TestExecSpace>(1000, 1000, 30000, false, doStructInterface);
-    testSortCRSUnmanaged<TestExecSpace>(false, doStructInterface);
-  }
-}
-
-TEST_F(TestCategory, common_sort_crsmatrix) {
-  for (int doStructInterface = 0; doStructInterface < 2; doStructInterface++) {
-    testSortCRS<TestExecSpace>(10, 10, 20, true, doStructInterface);
-    testSortCRS<TestExecSpace>(100, 100, 2000, true, doStructInterface);
-    testSortCRS<TestExecSpace>(1000, 1000, 30000, true, doStructInterface);
-    testSortCRSUnmanaged<TestExecSpace>(true, doStructInterface);
-  }
-}
-
-TEST_F(TestCategory, common_sort_crs_longrows) {
-  testSortCRS<TestExecSpace>(1, 50000, 10000, false, false);
-  testSortCRS<TestExecSpace>(1, 50000, 10000, true, false);
-}
-
-TEST_F(TestCategory, common_sort_merge_crsmatrix) {
-  testSortAndMerge<TestExecSpace>();
-}
-
 #endif
diff --git a/unit_test/common/Test_Common_Test_All_Type_Combos.hpp b/unit_test/common/Test_Common_Test_All_Type_Combos.hpp
new file mode 100644
index 0000000000..afacb09ee9
--- /dev/null
+++ b/unit_test/common/Test_Common_Test_All_Type_Combos.hpp
@@ -0,0 +1,190 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+/// \file Test_Common_Test_All_Type_Combos.hpp
+
+/**
+ * KOKKOSKERNELS_EXECUTE_TEST should take (SCALAR, ORDINAL, OFFSET, DEVICE). All
+ * these args are types.
+ * #define NO_TEST_COMPLEX to skip testing of kokkos complex types
+ */
+
+#if !defined(KOKKOSKERNELS_EXECUTE_TEST)
+#error Test_Common_Test_All_Type_Combos.hpp requires KOKKOSKERNELS_EXECUTE_TEST to be set
+#endif
+
+#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \
+     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
+
+// ETI is off, test all possible type combos
+
+KOKKOSKERNELS_EXECUTE_TEST(double, int, int, TestExecSpace)
+KOKKOSKERNELS_EXECUTE_TEST(double, int64_t, int, TestExecSpace)
+KOKKOSKERNELS_EXECUTE_TEST(double, int, size_t, TestExecSpace)
+KOKKOSKERNELS_EXECUTE_TEST(double, int64_t, size_t, TestExecSpace)
+KOKKOSKERNELS_EXECUTE_TEST(float, int, int, TestExecSpace)
+KOKKOSKERNELS_EXECUTE_TEST(float, int64_t, int, TestExecSpace)
+KOKKOSKERNELS_EXECUTE_TEST(float, int, size_t, TestExecSpace)
+KOKKOSKERNELS_EXECUTE_TEST(float, int64_t, size_t, TestExecSpace)
+
+#if !defined(NO_TEST_COMPLEX)
+
+KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int, int, TestExecSpace)
+KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestExecSpace)
+KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int, size_t, TestExecSpace)
+KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int64_t, size_t,
+                           TestExecSpace)
+KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int, int, TestExecSpace)
+KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestExecSpace)
+KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace)
+KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace)
+
+#endif
+
+#else
+
+// ETI is on, only test instantiated type combos
+
+#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&      \
+     defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \
+     defined(KOKKOSKERNELS_INST_OFFSET_INT))
+KOKKOSKERNELS_EXECUTE_TEST(double, int, int, TestExecSpace)
+#endif
+
+#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&          \
+     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
+     defined(KOKKOSKERNELS_INST_OFFSET_INT))
+KOKKOSKERNELS_EXECUTE_TEST(double, int64_t, int, TestExecSpace)
+#endif
+
+#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&      \
+     defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \
+     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T))
+KOKKOSKERNELS_EXECUTE_TEST(double, int, size_t, TestExecSpace)
+#endif
+
+#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&          \
+     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
+     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T))
+KOKKOSKERNELS_EXECUTE_TEST(double, int64_t, size_t, TestExecSpace)
+#endif
+
+#if (defined(KOKKOSKERNELS_INST_FLOAT) &&       \
+     defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \
+     defined(KOKKOSKERNELS_INST_OFFSET_INT))
+KOKKOSKERNELS_EXECUTE_TEST(float, int, int, TestExecSpace)
+#endif
+
+#if (defined(KOKKOSKERNELS_INST_FLOAT) &&           \
+     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
+     defined(KOKKOSKERNELS_INST_OFFSET_INT))
+KOKKOSKERNELS_EXECUTE_TEST(float, int64_t, int, TestExecSpace)
+#endif
+
+#if (defined(KOKKOSKERNELS_INST_FLOAT) &&       \
+     defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \
+     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T))
+KOKKOSKERNELS_EXECUTE_TEST(float, int, size_t, TestExecSpace)
+#endif
+
+#if (defined(KOKKOSKERNELS_INST_FLOAT) &&           \
+     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
+     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T))
+KOKKOSKERNELS_EXECUTE_TEST(float, int64_t, size_t, TestExecSpace)
+#endif
+
+#if !defined(NO_TEST_COMPLEX)
+
+#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
+     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&            \
+     defined(KOKKOSKERNELS_INST_OFFSET_INT))
+KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int, int, TestExecSpace)
+#endif
+
+#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
+     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&        \
+     defined(KOKKOSKERNELS_INST_OFFSET_INT))
+KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestExecSpace)
+#endif
+
+#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
+     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&            \
+     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T))
+KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int, size_t, TestExecSpace)
+#endif
+
+#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
+     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&        \
+     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T))
+KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int64_t, size_t,
+                           TestExecSpace)
+#endif
+
+#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
+     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&           \
+     defined(KOKKOSKERNELS_INST_OFFSET_INT))
+KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int, int, TestExecSpace)
+#endif
+
+#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
+     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&       \
+     defined(KOKKOSKERNELS_INST_OFFSET_INT))
+KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestExecSpace)
+#endif
+
+#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
+     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&           \
+     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T))
+KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace)
+#endif
+
+#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
+     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&       \
+     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T))
+KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace)
+#endif
+
+#endif  // !NO_TEST_COMPLEX
+
+#endif  // ETI ON
diff --git a/unit_test/common/Test_Common_Transpose.hpp b/unit_test/common/Test_Common_Transpose.hpp
deleted file mode 100644
index fba29da81d..0000000000
--- a/unit_test/common/Test_Common_Transpose.hpp
+++ /dev/null
@@ -1,173 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-/// \file Test_Common_Transpose.hpp
-
-#ifndef KOKKOSKERNELS_TRANSPOSE_HPP
-#define KOKKOSKERNELS_TRANSPOSE_HPP
-
-#include <Kokkos_Core.hpp>
-#include <Kokkos_Sort.hpp>
-#include <KokkosKernels_SparseUtils.hpp>
-#include <KokkosKernels_Sorting.hpp>
-#include <KokkosKernels_IOUtils.hpp>
-#include <KokkosKernels_default_types.hpp>
-#include <KokkosSparse_CrsMatrix.hpp>
-
-template <typename size_type, typename V>
-struct ExactCompare {
-  ExactCompare(const V& v1_, const V& v2_) : v1(v1_), v2(v2_) {}
-
-  KOKKOS_INLINE_FUNCTION void operator()(size_type i, size_type& ldiffs) const {
-    if (v1(i) != v2(i)) ldiffs++;
-  }
-
-  V v1;
-  V v2;
-};
-
-template <typename exec_space>
-void testTranspose(int numRows, int numCols, bool doValues) {
-  using range_pol  = Kokkos::RangePolicy<exec_space>;
-  using scalar_t   = default_scalar;
-  using lno_t      = default_lno_t;
-  using size_type  = default_size_type;
-  using mem_space  = typename exec_space::memory_space;
-  using device_t   = Kokkos::Device<exec_space, mem_space>;
-  using crsMat_t   = typename KokkosSparse::CrsMatrix<scalar_t, lno_t, device_t,
-                                                    void, size_type>;
-  using c_rowmap_t = typename crsMat_t::row_map_type;
-  using c_entries_t = typename crsMat_t::index_type;
-  using c_values_t  = typename crsMat_t::values_type;
-  using rowmap_t    = typename crsMat_t::row_map_type::non_const_type;
-  using entries_t   = typename crsMat_t::index_type::non_const_type;
-  using values_t    = typename crsMat_t::values_type::non_const_type;
-  size_type nnz     = 10 * numRows;
-  // Generate a matrix that has 0 entries in some rows
-  crsMat_t input_mat = KokkosKernels::Impl::kk_generate_sparse_matrix<crsMat_t>(
-      numRows, numCols, nnz, 3 * 10, numRows / 2);
-  // compute the transpose while unsorted, then transpose again
-  rowmap_t t_rowmap("Rowmap^T", numCols + 1);  // this view is initialized to 0
-  entries_t t_entries(
-      Kokkos::view_alloc(Kokkos::WithoutInitializing, "Entries^T"),
-      input_mat.graph.entries.extent(0));
-  values_t t_values(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Values^T"),
-                    input_mat.values.extent(0));
-  rowmap_t tt_rowmap("Rowmap^T^T",
-                     numRows + 1);  // this view is initialized to 0
-  entries_t tt_entries(
-      Kokkos::view_alloc(Kokkos::WithoutInitializing, "Entries^T^T"),
-      input_mat.graph.entries.extent(0));
-  values_t tt_values(
-      Kokkos::view_alloc(Kokkos::WithoutInitializing, "Values^T"),
-      input_mat.values.extent(0));
-  if (doValues) {
-    KokkosKernels::Impl::transpose_matrix<c_rowmap_t, c_entries_t, c_values_t,
-                                          rowmap_t, entries_t, values_t,
-                                          rowmap_t, exec_space>(
-        numRows, numCols, input_mat.graph.row_map, input_mat.graph.entries,
-        input_mat.values, t_rowmap, t_entries, t_values);
-    KokkosKernels::Impl::transpose_matrix<rowmap_t, entries_t, values_t,
-                                          rowmap_t, entries_t, values_t,
-                                          rowmap_t, exec_space>(
-        numCols, numRows, t_rowmap, t_entries, t_values, tt_rowmap, tt_entries,
-        tt_values);
-  } else {
-    KokkosKernels::Impl::transpose_graph<c_rowmap_t, c_entries_t, rowmap_t,
-                                         entries_t, rowmap_t, exec_space>(
-        numRows, numCols, input_mat.graph.row_map, input_mat.graph.entries,
-        t_rowmap, t_entries);
-    KokkosKernels::Impl::transpose_graph<rowmap_t, entries_t, rowmap_t,
-                                         entries_t, rowmap_t, exec_space>(
-        numCols, numRows, t_rowmap, t_entries, tt_rowmap, tt_entries);
-  }
-  // Sort both the transpose-transpose, and the original matrix (to compare
-  // directly)
-  KokkosKernels::sort_crs_matrix(input_mat);
-  KokkosKernels::sort_crs_matrix<exec_space, c_rowmap_t, entries_t, values_t>(
-      tt_rowmap, tt_entries, tt_values);
-  // The views should now be exactly identical, since they represent the same
-  // matrix and are sorted
-  size_type rowmapDiffs;
-  Kokkos::parallel_reduce(
-      range_pol(0, numRows + 1),
-      ExactCompare<size_type, c_rowmap_t>(input_mat.graph.row_map, tt_rowmap),
-      rowmapDiffs);
-  size_type entriesDiffs;
-  Kokkos::parallel_reduce(
-      range_pol(0, input_mat.nnz()),
-      ExactCompare<size_type, c_entries_t>(input_mat.graph.entries, tt_entries),
-      entriesDiffs);
-  EXPECT_EQ(size_type(0), rowmapDiffs);
-  EXPECT_EQ(size_type(0), entriesDiffs);
-  if (doValues) {
-    size_type valuesDiffs;
-    Kokkos::parallel_reduce(
-        range_pol(0, input_mat.nnz()),
-        ExactCompare<size_type, values_t>(input_mat.values, tt_values),
-        valuesDiffs);
-    EXPECT_EQ(size_type(0), valuesDiffs);
-  }
-}
-
-TEST_F(TestCategory, common_transpose_matrix) {
-  // Test both matrix and graph transpose with various sizes
-  testTranspose<TestExecSpace>(100, 100, true);
-  testTranspose<TestExecSpace>(500, 50, true);
-  testTranspose<TestExecSpace>(50, 500, true);
-  testTranspose<TestExecSpace>(4000, 2000, true);
-  testTranspose<TestExecSpace>(2000, 4000, true);
-  testTranspose<TestExecSpace>(2000, 2000, true);
-}
-
-TEST_F(TestCategory, common_transpose_graph) {
-  testTranspose<TestExecSpace>(100, 100, false);
-  testTranspose<TestExecSpace>(500, 50, false);
-  testTranspose<TestExecSpace>(50, 500, false);
-  testTranspose<TestExecSpace>(4000, 2000, false);
-  testTranspose<TestExecSpace>(2000, 4000, false);
-  testTranspose<TestExecSpace>(2000, 2000, false);
-}
-
-#endif
diff --git a/unit_test/common/Test_Common_set_bit_count.hpp b/unit_test/common/Test_Common_set_bit_count.hpp
index a085cc0024..937a2fdf1b 100644
--- a/unit_test/common/Test_Common_set_bit_count.hpp
+++ b/unit_test/common/Test_Common_set_bit_count.hpp
@@ -48,13 +48,9 @@
 #include "KokkosKernels_BitUtils.hpp"
 #include "KokkosKernels_SimpleUtils.hpp"
 #include "KokkosKernels_PrintUtils.hpp"
-#include <Kokkos_Concepts.hpp>
 #include <string>
 #include <stdexcept>
 
-#include <gtest/gtest.h>
-#include <Kokkos_Core.hpp>
-
 // const char *input_filename = "sherman1.mtx";
 // const char *input_filename = "Si2.mtx";
 // const char *input_filename = "wathen_30_30.mtx";
diff --git a/unit_test/graph/Test_Graph_graph_color.hpp b/unit_test/graph/Test_Graph_graph_color.hpp
index ef7c14a931..4d35874657 100644
--- a/unit_test/graph/Test_Graph_graph_color.hpp
+++ b/unit_test/graph/Test_Graph_graph_color.hpp
@@ -47,9 +47,10 @@
 
 #include "KokkosGraph_Distance1Color.hpp"
 #include "KokkosSparse_CrsMatrix.hpp"
-#include "KokkosKernels_IOUtils.hpp"
-#include "KokkosKernels_SparseUtils.hpp"
+#include "KokkosSparse_IOUtils.hpp"
+#include "KokkosSparse_Utils.hpp"
 #include "KokkosKernels_Handle.hpp"
+#include "KokkosKernels_default_types.hpp"
 
 using namespace KokkosKernels;
 using namespace KokkosKernels::Experimental;
@@ -115,7 +116,7 @@ void test_coloring(lno_t numRows, size_type nnz, lno_t bandwidth,
   // typedef typename lno_view_t::non_const_value_type size_type;
 
   lno_t numCols      = numRows;
-  crsMat_t input_mat = KokkosKernels::Impl::kk_generate_sparse_matrix<crsMat_t>(
+  crsMat_t input_mat = KokkosSparse::Impl::kk_generate_sparse_matrix<crsMat_t>(
       numRows, numCols, nnz, row_size_variance, bandwidth);
 
   typename lno_view_t::non_const_type sym_xadj;
@@ -168,7 +169,7 @@ void test_coloring(lno_t numRows, size_type nnz, lno_t bandwidth,
 
     const lno_t num_rows_1 = input_mat.numRows();
     const lno_t num_cols_1 = input_mat.numCols();
-    lno_t num_conflict     = KokkosKernels::Impl::kk_is_d1_coloring_valid<
+    lno_t num_conflict     = KokkosSparse::Impl::kk_is_d1_coloring_valid<
         lno_view_t, lno_nnz_view_t, color_view_t,
         typename device::execution_space>(
         num_rows_1, num_cols_1, input_mat.graph.row_map,
@@ -220,31 +221,28 @@ void test_coloring(lno_t numRows, size_type nnz, lno_t bandwidth,
      defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \
     (!defined(KOKKOSKERNELS_ETI_ONLY) &&        \
      !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int, int, TestExecSpace)
+EXECUTE_TEST(default_scalar, int, int, TestExecSpace)
 #endif
 
 #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
      defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||     \
     (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
      !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int64_t, int, TestExecSpace)
+EXECUTE_TEST(default_scalar, int64_t, int, TestExecSpace)
 #endif
 
-// FIXME_SYCL
-#ifndef KOKKOS_ENABLE_SYCL
 #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&    \
      defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \
     (!defined(KOKKOSKERNELS_ETI_ONLY) &&           \
      !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int, size_t, TestExecSpace)
-#endif
+EXECUTE_TEST(default_scalar, int, size_t, TestExecSpace)
 #endif
 
 #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
      defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||  \
     (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
      !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int64_t, size_t, TestExecSpace)
+EXECUTE_TEST(default_scalar, int64_t, size_t, TestExecSpace)
 #endif
 
 #undef EXECUTE_TEST
diff --git a/unit_test/graph/Test_Graph_graph_color_deterministic.hpp b/unit_test/graph/Test_Graph_graph_color_deterministic.hpp
index ec718e9aa4..e2e4a3d227 100644
--- a/unit_test/graph/Test_Graph_graph_color_deterministic.hpp
+++ b/unit_test/graph/Test_Graph_graph_color_deterministic.hpp
@@ -48,8 +48,9 @@
 #include "KokkosGraph_Distance1Color.hpp"
 #include "KokkosSparse_CrsMatrix.hpp"
 #include "KokkosKernels_IOUtils.hpp"
-#include "KokkosKernels_SparseUtils.hpp"
+#include "KokkosSparse_Utils.hpp"
 #include "KokkosKernels_Handle.hpp"
+#include "KokkosKernels_default_types.hpp"
 
 using namespace KokkosKernels;
 using namespace KokkosKernels::Experimental;
@@ -274,28 +275,28 @@ void test_coloring_deterministic(lno_t numRows, size_type nnz) {
      defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \
     (!defined(KOKKOSKERNELS_ETI_ONLY) &&        \
      !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int, int, TestExecSpace)
+EXECUTE_TEST(default_scalar, int, int, TestExecSpace)
 #endif
 
 #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
      defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||     \
     (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
      !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int64_t, int, TestExecSpace)
+EXECUTE_TEST(default_scalar, int64_t, int, TestExecSpace)
 #endif
 
 #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&    \
      defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \
     (!defined(KOKKOSKERNELS_ETI_ONLY) &&           \
      !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int, size_t, TestExecSpace)
+EXECUTE_TEST(default_scalar, int, size_t, TestExecSpace)
 #endif
 
 #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
      defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||  \
     (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
      !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int64_t, size_t, TestExecSpace)
+EXECUTE_TEST(default_scalar, int64_t, size_t, TestExecSpace)
 #endif
 
 #undef EXECUTE_TEST
diff --git a/unit_test/graph/Test_Graph_graph_color_distance2.hpp b/unit_test/graph/Test_Graph_graph_color_distance2.hpp
index 70158941a8..c78e8c2f5f 100644
--- a/unit_test/graph/Test_Graph_graph_color_distance2.hpp
+++ b/unit_test/graph/Test_Graph_graph_color_distance2.hpp
@@ -49,8 +49,8 @@
 #include "KokkosGraph_Distance2Color.hpp"
 #include "KokkosGraph_MIS2.hpp"
 #include "KokkosSparse_CrsMatrix.hpp"
-#include "KokkosKernels_IOUtils.hpp"
-#include "KokkosKernels_SparseUtils.hpp"
+#include "KokkosSparse_IOUtils.hpp"
+#include "KokkosSparse_Utils.hpp"
 #include "KokkosKernels_Handle.hpp"
 #include "KokkosKernels_ExecSpaceUtils.hpp"
 
@@ -159,7 +159,7 @@ void test_dist2_coloring(lno_t numVerts, size_type nnz, lno_t bandwidth,
       KokkosKernelsHandle<size_type, lno_t, double, execution_space,
                           memory_space, memory_space>;
   // Generate graph, and add some out-of-bounds columns
-  crsMat A = KokkosKernels::Impl::kk_generate_sparse_matrix<crsMat>(
+  crsMat A = KokkosSparse::Impl::kk_generate_sparse_matrix<crsMat>(
       numVerts, numVerts, nnz, row_size_variance, bandwidth);
   auto G = A.graph;
   // Symmetrize the graph
@@ -216,7 +216,7 @@ void test_bipartite_symmetric(lno_t numVerts, size_type nnz, lno_t bandwidth,
       KokkosKernelsHandle<size_type, lno_t, double, execution_space,
                           memory_space, memory_space>;
   // Generate graph, and add some out-of-bounds columns
-  crsMat A = KokkosKernels::Impl::kk_generate_sparse_matrix<crsMat>(
+  crsMat A = KokkosSparse::Impl::kk_generate_sparse_matrix<crsMat>(
       numVerts, numVerts, nnz, row_size_variance, bandwidth);
   auto G = A.graph;
   // Symmetrize the graph
@@ -273,13 +273,13 @@ void test_bipartite(lno_t numRows, lno_t numCols, size_type nnz,
       KokkosKernelsHandle<size_type, lno_t, double, execution_space,
                           memory_space, memory_space>;
   // Generate graph
-  crsMat A = KokkosKernels::Impl::kk_generate_sparse_matrix<crsMat>(
+  crsMat A = KokkosSparse::Impl::kk_generate_sparse_matrix<crsMat>(
       numRows, numCols, nnz, row_size_variance, bandwidth);
   auto G = A.graph;
   rowmap_t t_rowmap("rowmap^T", numCols + 1);
   entries_t t_entries("entries^T", G.entries.extent(0));
-  KokkosKernels::Impl::transpose_graph<c_rowmap_t, c_entries_t, rowmap_t,
-                                       entries_t, rowmap_t, execution_space>(
+  KokkosSparse::Impl::transpose_graph<c_rowmap_t, c_entries_t, rowmap_t,
+                                      entries_t, rowmap_t, execution_space>(
       numRows, numCols, G.row_map, G.entries, t_rowmap, t_entries);
   // TODO: remove me, shouldn't be needed even with UVM
   execution_space().fence();
diff --git a/unit_test/graph/Test_Graph_mis2.hpp b/unit_test/graph/Test_Graph_mis2.hpp
index ed3acc3b85..c1b5e179fe 100644
--- a/unit_test/graph/Test_Graph_mis2.hpp
+++ b/unit_test/graph/Test_Graph_mis2.hpp
@@ -50,7 +50,8 @@
 #include "KokkosGraph_ExplicitCoarsening.hpp"
 #include "KokkosSparse_CrsMatrix.hpp"
 #include "KokkosKernels_IOUtils.hpp"
-#include "KokkosKernels_SparseUtils.hpp"
+#include "KokkosSparse_IOUtils.hpp"
+#include "KokkosSparse_Utils.hpp"
 #include "KokkosKernels_Handle.hpp"
 #include "KokkosKernels_ExecSpaceUtils.hpp"
 
@@ -122,7 +123,7 @@ void test_mis2(lno_t numVerts, size_type nnz, lno_t bandwidth,
   using rowmap_t    = typename c_rowmap_t::non_const_type;
   using entries_t   = typename c_entries_t::non_const_type;
   // Generate graph, and add some out-of-bounds columns
-  crsMat A = KokkosKernels::Impl::kk_generate_sparse_matrix<crsMat>(
+  crsMat A = KokkosSparse::Impl::kk_generate_sparse_matrix<crsMat>(
       numVerts, numVerts, nnz, row_size_variance, bandwidth);
   auto G = A.graph;
   // Symmetrize the graph
@@ -164,7 +165,7 @@ void test_mis2_coarsening(lno_t numVerts, size_type nnz, lno_t bandwidth,
   using entries_t   = typename c_entries_t::non_const_type;
   using labels_t    = entries_t;
   // Generate graph, and add some out-of-bounds columns
-  crsMat A = KokkosKernels::Impl::kk_generate_sparse_matrix<crsMat>(
+  crsMat A = KokkosSparse::Impl::kk_generate_sparse_matrix<crsMat>(
       numVerts, numVerts, nnz, row_size_variance, bandwidth);
   auto G = A.graph;
   // Symmetrize the graph
diff --git a/unit_test/sparse/Test_Sparse.hpp b/unit_test/sparse/Test_Sparse.hpp
index 2afa0fb2db..e75eb1ce6a 100644
--- a/unit_test/sparse/Test_Sparse.hpp
+++ b/unit_test/sparse/Test_Sparse.hpp
@@ -12,12 +12,17 @@
 #include "Test_Sparse_spadd.hpp"
 #include "Test_Sparse_spgemm_jacobi.hpp"
 #include "Test_Sparse_spgemm.hpp"
+#include "Test_Sparse_bspgemm.hpp"
+#include "Test_Sparse_SortCrs.hpp"
 #include "Test_Sparse_spiluk.hpp"
 #include "Test_Sparse_spmv.hpp"
-//#include "Test_Sparse_spmv_blockcrs.hpp"
-//#include "Test_Sparse_spmv_bsr.hpp"
+#include "Test_Sparse_spmv_blockcrs.hpp"
+#include "Test_Sparse_spmv_bsr.hpp"
 #include "Test_Sparse_sptrsv.hpp"
 #include "Test_Sparse_trsv.hpp"
+#include "Test_Sparse_Transpose.hpp"
+#include "Test_Sparse_TestUtils_RandCscMat.hpp"
+#include "Test_Sparse_csc2csr.hpp"
 
 // TPL specific tests, these require
 // particular pairs of backend and TPL
diff --git a/unit_test/sparse/Test_Sparse_BlockCrsMatrix.hpp b/unit_test/sparse/Test_Sparse_BlockCrsMatrix.hpp
index e87514c3c6..6eb4488c72 100644
--- a/unit_test/sparse/Test_Sparse_BlockCrsMatrix.hpp
+++ b/unit_test/sparse/Test_Sparse_BlockCrsMatrix.hpp
@@ -372,139 +372,13 @@ void testBlockCrsMatrix() {
   }
 }
 
-#define EXECUTE_BLOCKCRS_TEST(SCALAR, ORDINAL, OFFSET, DEVICE)                \
+#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE)           \
   TEST_F(                                                                     \
       TestCategory,                                                           \
       sparse##_##blkcrsmatrix##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \
     testBlockCrsMatrix<SCALAR, ORDINAL, OFFSET, DEVICE>();                    \
   }
 
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&      \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&        \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BLOCKCRS_TEST(double, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||     \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BLOCKCRS_TEST(double, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&         \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&    \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&           \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BLOCKCRS_TEST(double, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||  \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BLOCKCRS_TEST(double, int64_t, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&       \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&        \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BLOCKCRS_TEST(float, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&           \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||     \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BLOCKCRS_TEST(float, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&    \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&           \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BLOCKCRS_TEST(float, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&           \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||  \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BLOCKCRS_TEST(float, int64_t, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&            \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||            \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BLOCKCRS_TEST(kokkos_complex_double, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&        \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||            \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BLOCKCRS_TEST(kokkos_complex_double, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&            \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||         \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BLOCKCRS_TEST(kokkos_complex_double, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&        \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||         \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BLOCKCRS_TEST(kokkos_complex_double, int64_t, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&           \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||           \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BLOCKCRS_TEST(kokkos_complex_float, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&       \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||           \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BLOCKCRS_TEST(kokkos_complex_float, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&           \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||        \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BLOCKCRS_TEST(kokkos_complex_float, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&       \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||        \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BLOCKCRS_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace)
-#endif
-
-#undef EXECUTE_BLOCKCRS_TEST
+#include <Test_Common_Test_All_Type_Combos.hpp>
+
+#undef KOKKOSKERNELS_EXECUTE_TEST
diff --git a/unit_test/sparse/Test_Sparse_BsrMatrix.hpp b/unit_test/sparse/Test_Sparse_BsrMatrix.hpp
index 49a0ce6d4f..501ebc2ead 100644
--- a/unit_test/sparse/Test_Sparse_BsrMatrix.hpp
+++ b/unit_test/sparse/Test_Sparse_BsrMatrix.hpp
@@ -374,138 +374,12 @@ void testBsrMatrix() {
   }
 }
 
-#define EXECUTE_BSR_TEST(SCALAR, ORDINAL, OFFSET, DEVICE)                     \
+#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE)           \
   TEST_F(TestCategory,                                                        \
          sparse##_##bsrmatrix##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \
     testBsrMatrix<SCALAR, ORDINAL, OFFSET, DEVICE>();                         \
   }
 
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&      \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&        \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BSR_TEST(double, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||     \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BSR_TEST(double, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&         \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&    \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&           \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BSR_TEST(double, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||  \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BSR_TEST(double, int64_t, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&       \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&        \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BSR_TEST(float, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&           \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||     \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BSR_TEST(float, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&    \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&           \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BSR_TEST(float, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&           \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||  \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BSR_TEST(float, int64_t, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&            \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||            \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BSR_TEST(kokkos_complex_double, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&        \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||            \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BSR_TEST(kokkos_complex_double, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&            \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||         \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BSR_TEST(kokkos_complex_double, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&        \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||         \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BSR_TEST(kokkos_complex_double, int64_t, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&           \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||           \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BSR_TEST(kokkos_complex_float, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&       \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||           \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BSR_TEST(kokkos_complex_float, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&           \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||        \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BSR_TEST(kokkos_complex_float, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&       \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||        \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BSR_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace)
-#endif
-
-#undef EXECUTE_BSR_TEST
+#include <Test_Common_Test_All_Type_Combos.hpp>
+
+#undef KOKKOSKERNELS_EXECUTE_TEST
diff --git a/unit_test/sparse/Test_Sparse_CrsMatrix.hpp b/unit_test/sparse/Test_Sparse_CrsMatrix.hpp
index 6f67a6e8bb..8a85e43670 100644
--- a/unit_test/sparse/Test_Sparse_CrsMatrix.hpp
+++ b/unit_test/sparse/Test_Sparse_CrsMatrix.hpp
@@ -245,7 +245,7 @@ void testCrsMatrixHostMirror() {
   EXPECT_EQ(zeroHost.graph.row_map.extent(0), 0);
 }
 
-#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE)                                  \
+#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE)                    \
   TEST_F(TestCategory,                                                                 \
          sparse##_##crsmatrix##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) {          \
     testCrsMatrix<SCALAR, ORDINAL, OFFSET, DEVICE>();                                  \
@@ -257,132 +257,6 @@ void testCrsMatrixHostMirror() {
     testCrsMatrixHostMirror<SCALAR, ORDINAL, OFFSET, DEVICE>();                        \
   }
 
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&      \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&        \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int, int, TestExecSpace)
-#endif
+#include <Test_Common_Test_All_Type_Combos.hpp>
 
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||     \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&         \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&    \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&           \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||  \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int64_t, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&       \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&        \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(float, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&           \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||     \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(float, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&    \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&           \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(float, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&           \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||  \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(float, int64_t, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&            \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||            \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_double, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&        \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||            \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&            \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||         \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_double, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&        \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||         \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&           \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||           \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_float, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&       \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||           \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&           \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||        \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&       \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||        \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace)
-#endif
-
-#undef EXECUTE_TEST
+#undef KOKKOSKERNELS_EXECUTE_TEST
diff --git a/unit_test/sparse/Test_Sparse_SortCrs.hpp b/unit_test/sparse/Test_Sparse_SortCrs.hpp
new file mode 100644
index 0000000000..a4d30b40a1
--- /dev/null
+++ b/unit_test/sparse/Test_Sparse_SortCrs.hpp
@@ -0,0 +1,310 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+/// \file Test_Sparse_SortCrs.hpp
+/// \brief Tests for sort_crs_matrix and sort_crs_graph in
+/// KokkosSparse_SortCrs.hpp
+
+#ifndef KOKKOSSPARSE_SORTCRSTEST_HPP
+#define KOKKOSSPARSE_SORTCRSTEST_HPP
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_Sort.hpp>
+#include <KokkosKernels_Utils.hpp>
+#include "KokkosSparse_IOUtils.hpp"
+#include <KokkosSparse_SortCrs.hpp>
+#include <KokkosKernels_default_types.hpp>
+#include <KokkosSparse_CrsMatrix.hpp>
+#include <Kokkos_ArithTraits.hpp>
+#include <Kokkos_Complex.hpp>
+#include <cstdlib>
+
+template <typename exec_space>
+void testSortCRS(default_lno_t numRows, default_lno_t numCols,
+                 default_size_type nnz, bool doValues, bool doStructInterface) {
+  using scalar_t  = default_scalar;
+  using lno_t     = default_lno_t;
+  using size_type = default_size_type;
+  using mem_space = typename exec_space::memory_space;
+  using device_t  = Kokkos::Device<exec_space, mem_space>;
+  using crsMat_t =
+      KokkosSparse::CrsMatrix<scalar_t, lno_t, device_t, void, size_type>;
+  using rowmap_t  = typename crsMat_t::row_map_type;
+  using entries_t = typename crsMat_t::index_type;
+  using values_t  = typename crsMat_t::values_type;
+  // Create a random matrix on device
+  // IMPORTANT: kk_generate_sparse_matrix does not sort the rows, if it did this
+  // wouldn't test anything
+  crsMat_t A = KokkosSparse::Impl::kk_generate_sparse_matrix<crsMat_t>(
+      numRows, numCols, nnz, 2, numCols / 2);
+  auto rowmap  = A.graph.row_map;
+  auto entries = A.graph.entries;
+  auto values  = A.values;
+  Kokkos::View<size_type*, Kokkos::HostSpace> rowmapHost("rowmap host",
+                                                         numRows + 1);
+  Kokkos::View<lno_t*, Kokkos::HostSpace> entriesHost("sorted entries host",
+                                                      nnz);
+  Kokkos::View<scalar_t*, Kokkos::HostSpace> valuesHost("sorted values host",
+                                                        nnz);
+  Kokkos::deep_copy(rowmapHost, rowmap);
+  Kokkos::deep_copy(entriesHost, entries);
+  Kokkos::deep_copy(valuesHost, values);
+  struct ColValue {
+    ColValue() {}
+    ColValue(lno_t c, scalar_t v) : col(c), val(v) {}
+    bool operator<(const ColValue& rhs) const { return col < rhs.col; }
+    bool operator==(const ColValue& rhs) const {
+      return col == rhs.col && val == rhs.val;
+    }
+    lno_t col;
+    scalar_t val;
+  };
+  // sort one row at a time on host using STL.
+  {
+    for (lno_t i = 0; i < numRows; i++) {
+      std::vector<ColValue> rowCopy;
+      for (size_type j = rowmapHost(i); j < rowmapHost(i + 1); j++)
+        rowCopy.emplace_back(entriesHost(j), valuesHost(j));
+      std::sort(rowCopy.begin(), rowCopy.end());
+      // write sorted row back
+      for (size_t j = 0; j < rowCopy.size(); j++) {
+        entriesHost(rowmapHost(i) + j) = rowCopy[j].col;
+        valuesHost(rowmapHost(i) + j)  = rowCopy[j].val;
+      }
+    }
+  }
+  // call the actual sort routine being tested
+  if (doValues) {
+    if (doStructInterface) {
+      KokkosSparse::sort_crs_matrix(A);
+    } else {
+      KokkosSparse::sort_crs_matrix<exec_space, rowmap_t, entries_t, values_t>(
+          A.graph.row_map, A.graph.entries, A.values);
+    }
+  } else {
+    if (doStructInterface) {
+      KokkosSparse::sort_crs_graph(A.graph);
+    } else {
+      KokkosSparse::sort_crs_graph<exec_space, rowmap_t, entries_t>(
+          A.graph.row_map, A.graph.entries);
+    }
+  }
+  // Copy to host and compare
+  Kokkos::View<lno_t*, Kokkos::HostSpace> entriesOut("sorted entries host",
+                                                     nnz);
+  Kokkos::View<scalar_t*, Kokkos::HostSpace> valuesOut("sorted values host",
+                                                       nnz);
+  Kokkos::deep_copy(entriesOut, entries);
+  Kokkos::deep_copy(valuesOut, values);
+  for (size_type i = 0; i < nnz; i++) {
+    EXPECT_EQ(entriesHost(i), entriesOut(i))
+        << "Sorted column indices are wrong!";
+    if (doValues) {
+      EXPECT_EQ(valuesHost(i), valuesOut(i)) << "Sorted values are wrong!";
+    }
+  }
+}
+
+template <typename exec_space>
+void testSortCRSUnmanaged(bool doValues, bool doStructInterface) {
+  // This test is about bug #960.
+  using scalar_t  = default_scalar;
+  using lno_t     = default_lno_t;
+  using size_type = default_size_type;
+  using mem_space = typename exec_space::memory_space;
+  using device_t  = Kokkos::Device<exec_space, mem_space>;
+  using crsMat_t =
+      KokkosSparse::CrsMatrix<scalar_t, lno_t, device_t,
+                              Kokkos::MemoryTraits<Kokkos::Unmanaged>,
+                              size_type>;
+  using crsMat_Managed_t =
+      KokkosSparse::CrsMatrix<scalar_t, lno_t, device_t, void, size_type>;
+  using rowmap_t      = typename crsMat_t::row_map_type;
+  using entries_t     = typename crsMat_t::index_type;
+  using values_t      = typename crsMat_t::values_type;
+  const lno_t numRows = 50;
+  const lno_t numCols = numRows;
+  size_type nnz       = numRows * 5;
+  // Create a random matrix on device
+  // IMPORTANT: kk_generate_sparse_matrix does not sort the rows, if it did this
+  // wouldn't test anything
+  crsMat_Managed_t A_managed =
+      KokkosSparse::Impl::kk_generate_sparse_matrix<crsMat_Managed_t>(
+          numRows, numCols, nnz, 2, numCols / 2);
+  crsMat_t A(A_managed);
+  auto rowmap  = A.graph.row_map;
+  auto entries = A.graph.entries;
+  auto values  = A.values;
+  if (doValues) {
+    if (doStructInterface) {
+      KokkosSparse::sort_crs_matrix(A);
+    } else {
+      KokkosSparse::sort_crs_matrix<exec_space, rowmap_t, entries_t, values_t>(
+          A.graph.row_map, A.graph.entries, A.values);
+    }
+  } else {
+    if (doStructInterface) {
+      KokkosSparse::sort_crs_graph(A.graph);
+    } else {
+      KokkosSparse::sort_crs_graph<exec_space, rowmap_t, entries_t>(
+          A.graph.row_map, A.graph.entries);
+    }
+  }
+}
+
+template <typename exec_space>
+void testSortAndMerge() {
+  using size_type = default_size_type;
+  using lno_t     = default_lno_t;
+  using scalar_t  = default_scalar;
+  using mem_space = typename exec_space::memory_space;
+  using device_t  = Kokkos::Device<exec_space, mem_space>;
+  using crsMat_t =
+      KokkosSparse::CrsMatrix<scalar_t, lno_t, device_t, void, size_type>;
+  using rowmap_t  = typename crsMat_t::row_map_type::non_const_type;
+  using entries_t = typename crsMat_t::index_type;
+  using values_t  = typename crsMat_t::values_type;
+  using Kokkos::HostSpace;
+  using Kokkos::MemoryTraits;
+  using Kokkos::Unmanaged;
+  // Create a small CRS matrix on host
+  std::vector<size_type> inRowmap = {0, 4, 4, 5, 7, 10};
+  std::vector<lno_t> inEntries    = {
+      4, 3, 5, 3,  // row 0
+                   // row 1 has no entries
+      6,           // row 2
+      2, 2,        // row 3
+      0, 1, 2      // row 4
+  };
+  // note: choosing values that can be represented exactly by float
+  std::vector<scalar_t> inValues = {
+      1.5, 4, 1, -3,  // row 0
+                      // row 1
+      2,              // row 2
+      -1, -2,         // row 3
+      0, 3.5, -2.25   // row 4
+  };
+  lno_t nrows   = 5;
+  lno_t ncols   = 7;
+  size_type nnz = inEntries.size();
+  Kokkos::View<size_type*, HostSpace, MemoryTraits<Unmanaged>> hostInRowmap(
+      inRowmap.data(), nrows + 1);
+  Kokkos::View<lno_t*, HostSpace, MemoryTraits<Unmanaged>> hostInEntries(
+      inEntries.data(), nnz);
+  Kokkos::View<scalar_t*, HostSpace, MemoryTraits<Unmanaged>> hostInValues(
+      inValues.data(), nnz);
+  rowmap_t devInRowmap("", nrows + 1);
+  entries_t devInEntries("", nnz);
+  values_t devInValues("", nnz);
+  Kokkos::deep_copy(devInRowmap, hostInRowmap);
+  Kokkos::deep_copy(devInEntries, hostInEntries);
+  Kokkos::deep_copy(devInValues, hostInValues);
+  crsMat_t input("Input", nrows, ncols, nnz, devInValues, devInRowmap,
+                 devInEntries);
+  crsMat_t output = KokkosSparse::sort_and_merge_matrix(input);
+  exec_space().fence();
+  EXPECT_EQ(output.numRows(), nrows);
+  EXPECT_EQ(output.numCols(), ncols);
+  auto outRowmap  = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
+                                                       output.graph.row_map);
+  auto outEntries = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
+                                                        output.graph.entries);
+  auto outValues =
+      Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), output.values);
+  // Expect 2 merges to have taken place
+  std::vector<size_type> goldRowmap = {0, 3, 3, 4, 5, 8};
+  std::vector<lno_t> goldEntries    = {
+      3, 4, 5,  // row 0
+                // row 1 has no entries
+      6,        // row 2
+      2,        // row 3
+      0, 1, 2   // row 4
+  };
+  // note: choosing values that can be represented exactly by float
+  std::vector<scalar_t> goldValues = {
+      1, 1.5, 1,     // row 0
+                     // row 1
+      2,             // row 2
+      -3,            // row 3
+      0, 3.5, -2.25  // row 4
+  };
+  EXPECT_EQ(goldRowmap.size(), outRowmap.extent(0));
+  EXPECT_EQ(goldEntries.size(), outEntries.extent(0));
+  EXPECT_EQ(goldValues.size(), outValues.extent(0));
+  EXPECT_EQ(goldValues.size(), output.nnz());
+  for (lno_t i = 0; i < nrows + 1; i++) EXPECT_EQ(goldRowmap[i], outRowmap(i));
+  for (size_type i = 0; i < output.nnz(); i++) {
+    EXPECT_EQ(goldEntries[i], outEntries(i));
+    EXPECT_EQ(goldValues[i], outValues(i));
+  }
+}
+
+TEST_F(TestCategory, common_sort_crsgraph) {
+  for (int doStructInterface = 0; doStructInterface < 2; doStructInterface++) {
+    testSortCRS<TestExecSpace>(10, 10, 20, false, doStructInterface);
+    testSortCRS<TestExecSpace>(100, 100, 2000, false, doStructInterface);
+    testSortCRS<TestExecSpace>(1000, 1000, 30000, false, doStructInterface);
+    testSortCRSUnmanaged<TestExecSpace>(false, doStructInterface);
+  }
+}
+
+TEST_F(TestCategory, common_sort_crsmatrix) {
+  for (int doStructInterface = 0; doStructInterface < 2; doStructInterface++) {
+    testSortCRS<TestExecSpace>(10, 10, 20, true, doStructInterface);
+    testSortCRS<TestExecSpace>(100, 100, 2000, true, doStructInterface);
+    testSortCRS<TestExecSpace>(1000, 1000, 30000, true, doStructInterface);
+    testSortCRSUnmanaged<TestExecSpace>(true, doStructInterface);
+  }
+}
+
+TEST_F(TestCategory, common_sort_crs_longrows) {
+  testSortCRS<TestExecSpace>(1, 50000, 10000, false, false);
+  testSortCRS<TestExecSpace>(1, 50000, 10000, true, false);
+}
+
+TEST_F(TestCategory, common_sort_merge_crsmatrix) {
+  testSortAndMerge<TestExecSpace>();
+}
+
+#endif  // KOKKOSSPARSE_SORTCRSTEST_HPP
diff --git a/unit_test/sparse/Test_Sparse_TestUtils_RandCscMat.hpp b/unit_test/sparse/Test_Sparse_TestUtils_RandCscMat.hpp
new file mode 100644
index 0000000000..fc33f9f08b
--- /dev/null
+++ b/unit_test/sparse/Test_Sparse_TestUtils_RandCscMat.hpp
@@ -0,0 +1,105 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include "KokkosKernels_TestUtils.hpp"
+
+namespace Test {
+template <class ScalarType, class LayoutType, class ExeSpaceType>
+void doCscMat(size_t m, size_t n, ScalarType min_val, ScalarType max_val) {
+  auto expected_min    = ScalarType(1.0);
+  int64_t expected_nnz = 0;
+  RandCscMat<ScalarType, LayoutType, ExeSpaceType> cm(m, n, min_val, max_val);
+
+  for (int64_t i = 0; i < cm.get_nnz(); ++i)
+    ASSERT_GE(cm(i), expected_min) << cm.info;
+
+  for (int64_t j = 0; j < cm.get_n(); ++j) {
+    for (int64_t i = 0; i < cm.get_col_len(j); ++i)
+      ASSERT_FLOAT_EQ(cm(cm.get_col_start(j) + i), cm(expected_nnz + i))
+          << cm.info;
+    expected_nnz += cm.get_col_len(j);
+  }
+  ASSERT_EQ(cm.get_nnz(), expected_nnz) << cm.info;
+
+  // No need to check data here. Kokkos unit-tests deep_copy.
+  auto vals = cm.get_vals();
+  ASSERT_EQ(vals.extent(0), cm.get_nnz() + 1) << cm.info;
+
+  auto row_ids = cm.get_row_ids();
+  ASSERT_EQ(row_ids.extent(0), cm.get_n() * cm.get_m() + 1) << cm.info;
+
+  auto col_map = cm.get_col_map();
+  ASSERT_EQ(col_map.extent(0), cm.get_n() + 1);
+}
+
+template <class ExeSpaceType>
+void doAllCscMat(size_t m, size_t n) {
+  int min = 1, max = 10;
+
+  // Verify that CscMax is constructed properly.
+  doCscMat<float, Kokkos::LayoutLeft, ExeSpaceType>(m, n, min, max);
+  doCscMat<float, Kokkos::LayoutRight, ExeSpaceType>(m, n, min, max);
+
+  doCscMat<double, Kokkos::LayoutLeft, ExeSpaceType>(m, n, min, max);
+  doCscMat<double, Kokkos::LayoutRight, ExeSpaceType>(m, n, min, max);
+
+  // Verify that CscMax can be instantiated with complex types.
+  RandCscMat<Kokkos::complex<float>, Kokkos::LayoutLeft, ExeSpaceType> cmcf(
+      m, n, min, max);
+  RandCscMat<Kokkos::complex<double>, Kokkos::LayoutRight, ExeSpaceType> cmcd(
+      m, n, min, max);
+}
+
+// Test randomly generated csc matrices
+TEST_F(TestCategory, sparse_randcscmat) {
+  // Square cases
+  for (int dim = 1; dim < 1024; dim *= 4) doAllCscMat<TestExecSpace>(dim, dim);
+
+  // Non-square cases
+  for (int dim = 1; dim < 1024; dim *= 4) {
+    doAllCscMat<TestExecSpace>(dim * 3, dim);
+    doAllCscMat<TestExecSpace>(dim, dim * 3);
+  }
+}
+}  // namespace Test
\ No newline at end of file
diff --git a/unit_test/sparse/Test_Sparse_Transpose.hpp b/unit_test/sparse/Test_Sparse_Transpose.hpp
new file mode 100644
index 0000000000..77868a7251
--- /dev/null
+++ b/unit_test/sparse/Test_Sparse_Transpose.hpp
@@ -0,0 +1,357 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+/// \file Test_Common_Transpose.hpp
+
+#ifndef KOKKOSKERNELS_TRANSPOSE_HPP
+#define KOKKOSKERNELS_TRANSPOSE_HPP
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_Sort.hpp>
+#include <KokkosSparse_Utils.hpp>
+#include <KokkosKernels_IOUtils.hpp>
+#include <KokkosSparse_IOUtils.hpp>
+#include <KokkosKernels_default_types.hpp>
+#include <KokkosSparse_CrsMatrix.hpp>
+#include <KokkosSparse_SortCrs.hpp>
+
+template <typename size_type, typename V>
+struct ExactCompare {
+  ExactCompare(const V& v1_, const V& v2_) : v1(v1_), v2(v2_) {}
+
+  KOKKOS_INLINE_FUNCTION void operator()(size_type i, size_type& ldiffs) const {
+    if (v1(i) != v2(i)) ldiffs++;
+  }
+
+  V v1;
+  V v2;
+};
+
+template <typename exec_space>
+void testTranspose(int numRows, int numCols, bool doValues) {
+  using range_pol  = Kokkos::RangePolicy<exec_space>;
+  using scalar_t   = default_scalar;
+  using lno_t      = default_lno_t;
+  using size_type  = default_size_type;
+  using mem_space  = typename exec_space::memory_space;
+  using device_t   = Kokkos::Device<exec_space, mem_space>;
+  using crsMat_t   = typename KokkosSparse::CrsMatrix<scalar_t, lno_t, device_t,
+                                                    void, size_type>;
+  using c_rowmap_t = typename crsMat_t::row_map_type;
+  using c_entries_t = typename crsMat_t::index_type;
+  using c_values_t  = typename crsMat_t::values_type;
+  using rowmap_t    = typename crsMat_t::row_map_type::non_const_type;
+  using entries_t   = typename crsMat_t::index_type::non_const_type;
+  using values_t    = typename crsMat_t::values_type::non_const_type;
+  size_type nnz     = 10 * numRows;
+  // Generate a matrix that has 0 entries in some rows
+  crsMat_t input_mat = KokkosSparse::Impl::kk_generate_sparse_matrix<crsMat_t>(
+      numRows, numCols, nnz, 3 * 10, numRows / 2);
+  // compute the transpose while unsorted, then transpose again
+  rowmap_t t_rowmap("Rowmap^T", numCols + 1);  // this view is initialized to 0
+  entries_t t_entries(
+      Kokkos::view_alloc(Kokkos::WithoutInitializing, "Entries^T"),
+      input_mat.graph.entries.extent(0));
+  values_t t_values(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Values^T"),
+                    input_mat.values.extent(0));
+  rowmap_t tt_rowmap("Rowmap^T^T",
+                     numRows + 1);  // this view is initialized to 0
+  entries_t tt_entries(
+      Kokkos::view_alloc(Kokkos::WithoutInitializing, "Entries^T^T"),
+      input_mat.graph.entries.extent(0));
+  values_t tt_values(
+      Kokkos::view_alloc(Kokkos::WithoutInitializing, "Values^T"),
+      input_mat.values.extent(0));
+  if (doValues) {
+    KokkosSparse::Impl::transpose_matrix<c_rowmap_t, c_entries_t, c_values_t,
+                                         rowmap_t, entries_t, values_t,
+                                         rowmap_t, exec_space>(
+        numRows, numCols, input_mat.graph.row_map, input_mat.graph.entries,
+        input_mat.values, t_rowmap, t_entries, t_values);
+    KokkosSparse::Impl::transpose_matrix<rowmap_t, entries_t, values_t,
+                                         rowmap_t, entries_t, values_t,
+                                         rowmap_t, exec_space>(
+        numCols, numRows, t_rowmap, t_entries, t_values, tt_rowmap, tt_entries,
+        tt_values);
+  } else {
+    KokkosSparse::Impl::transpose_graph<c_rowmap_t, c_entries_t, rowmap_t,
+                                        entries_t, rowmap_t, exec_space>(
+        numRows, numCols, input_mat.graph.row_map, input_mat.graph.entries,
+        t_rowmap, t_entries);
+    KokkosSparse::Impl::transpose_graph<rowmap_t, entries_t, rowmap_t,
+                                        entries_t, rowmap_t, exec_space>(
+        numCols, numRows, t_rowmap, t_entries, tt_rowmap, tt_entries);
+  }
+  // Sort both the transpose-transpose, and the original matrix (to compare
+  // directly)
+  KokkosSparse::sort_crs_matrix(input_mat);
+  KokkosSparse::sort_crs_matrix<exec_space, c_rowmap_t, entries_t, values_t>(
+      tt_rowmap, tt_entries, tt_values);
+  // The views should now be exactly identical, since they represent the same
+  // matrix and are sorted
+  size_type rowmapDiffs;
+  Kokkos::parallel_reduce(
+      range_pol(0, numRows + 1),
+      ExactCompare<size_type, c_rowmap_t>(input_mat.graph.row_map, tt_rowmap),
+      rowmapDiffs);
+  size_type entriesDiffs;
+  Kokkos::parallel_reduce(
+      range_pol(0, input_mat.nnz()),
+      ExactCompare<size_type, c_entries_t>(input_mat.graph.entries, tt_entries),
+      entriesDiffs);
+  EXPECT_EQ(size_type(0), rowmapDiffs);
+  EXPECT_EQ(size_type(0), entriesDiffs);
+  if (doValues) {
+    size_type valuesDiffs;
+    Kokkos::parallel_reduce(
+        range_pol(0, input_mat.nnz()),
+        ExactCompare<size_type, values_t>(input_mat.values, tt_values),
+        valuesDiffs);
+    EXPECT_EQ(size_type(0), valuesDiffs);
+  }
+}
+
+template <class bsrMat_t>
+void CompareBsrMatrices(bsrMat_t& A, bsrMat_t& B) {
+  using exec_space  = typename bsrMat_t::execution_space;
+  using range_pol   = Kokkos::RangePolicy<exec_space>;
+  using size_type   = default_size_type;
+  using c_rowmap_t  = typename bsrMat_t::row_map_type;
+  using c_entries_t = typename bsrMat_t::index_type;
+  using values_t    = typename bsrMat_t::values_type::non_const_type;
+
+  // The views should now be exactly identical, since they represent the same
+  // matrix and are sorted
+
+  size_type rowmapDiffs;
+  Kokkos::parallel_reduce(
+      range_pol(0, A.numRows() + 1),
+      ExactCompare<size_type, c_rowmap_t>(A.graph.row_map, B.graph.row_map),
+      rowmapDiffs);
+
+  size_type entriesDiffs;
+  Kokkos::parallel_reduce(
+      range_pol(0, A.nnz()),
+      ExactCompare<size_type, c_entries_t>(A.graph.entries, B.graph.entries),
+      entriesDiffs);
+
+  EXPECT_EQ(size_type(0), rowmapDiffs);
+  EXPECT_EQ(size_type(0), entriesDiffs);
+
+  size_type valuesDiffs;
+  Kokkos::parallel_reduce(range_pol(0, A.nnz() * A.blockDim() * A.blockDim()),
+                          ExactCompare<size_type, values_t>(A.values, B.values),
+                          valuesDiffs);
+  EXPECT_EQ(size_type(0), valuesDiffs);
+}
+
+template <typename exec_space>
+void testTransposeBsrRef() {
+  using scalar_t  = default_scalar;
+  using lno_t     = default_lno_t;
+  using size_type = default_size_type;
+  using mem_space = typename exec_space::memory_space;
+  using device_t  = Kokkos::Device<exec_space, mem_space>;
+  using bsrMat_t =
+      typename KokkosSparse::Experimental::BsrMatrix<scalar_t, lno_t, device_t,
+                                                     void, size_type>;
+  using rowmap_t  = typename bsrMat_t::row_map_type::non_const_type;
+  using entries_t = typename bsrMat_t::index_type::non_const_type;
+  using values_t  = typename bsrMat_t::values_type::non_const_type;
+
+  const int numRows    = 4;
+  const int nnz        = 7;
+  const int block_size = 2;
+
+  // Coming up with a BsrMatrix
+  bsrMat_t A;
+  {
+    rowmap_t row_map("row map", numRows + 1);
+    entries_t entries("entries", nnz);
+    values_t values("values", nnz * block_size * block_size);
+
+    const size_type row_mapPtr[] = {0, 2, 3, 5, 7};
+    const lno_t entriesPtr[]     = {2, 3, 1, 0, 1, 1, 3};
+    const scalar_t valuesPtr[]   = {
+        0.0, 0.1, 0.2, 0.3, 1.0, 1.1, 1.2, 1.3, 2.0, 2.1, 2.2, 2.3, 3.0, 3.1,
+        3.2, 3.3, 4.0, 4.1, 4.2, 4.3, 5.0, 5.1, 5.2, 5.3, 6.0, 6.1, 6.2, 6.3};
+
+    typename rowmap_t::HostMirror::const_type row_map_h(row_mapPtr,
+                                                        numRows + 1);
+    typename entries_t::HostMirror::const_type entries_h(entriesPtr, nnz);
+    typename values_t::HostMirror::const_type values_h(
+        valuesPtr, nnz * block_size * block_size);
+
+    Kokkos::deep_copy(row_map, row_map_h);
+    Kokkos::deep_copy(entries, entries_h);
+    Kokkos::deep_copy(values, values_h);
+
+    A = bsrMat_t("A", numRows, numRows, nnz, values, row_map, entries,
+                 block_size);
+  }
+
+  // Constructing the transpose of A manually
+  bsrMat_t At_ref;
+  {
+    rowmap_t row_map("row map", numRows + 1);
+    entries_t entries("entries", nnz);
+    values_t values("values", nnz * block_size * block_size);
+
+    const size_type row_mapPtr[] = {0, 1, 4, 5, 7};
+    const lno_t entriesPtr[]     = {2, 1, 2, 3, 0, 0, 3};
+    const scalar_t valuesPtr[]   = {
+        3.0, 3.2, 3.1, 3.3, 2.0, 2.2, 2.1, 2.3, 4.0, 4.2, 4.1, 4.3, 5.0, 5.2,
+        5.1, 5.3, 0.0, 0.2, 0.1, 0.3, 1.0, 1.2, 1.1, 1.3, 6.0, 6.2, 6.1, 6.3};
+
+    typename rowmap_t::HostMirror::const_type row_map_h(row_mapPtr,
+                                                        numRows + 1);
+    typename entries_t::HostMirror::const_type entries_h(entriesPtr, nnz);
+    typename values_t::HostMirror::const_type values_h(
+        valuesPtr, nnz * block_size * block_size);
+
+    Kokkos::deep_copy(row_map, row_map_h);
+    Kokkos::deep_copy(entries, entries_h);
+    Kokkos::deep_copy(values, values_h);
+
+    At_ref = bsrMat_t("A", numRows, numRows, nnz, values, row_map, entries,
+                      block_size);
+  }
+
+  bsrMat_t At = KokkosSparse::Impl::transpose_bsr_matrix(A);
+  KokkosSparse::sort_bsr_matrix(At);
+
+  CompareBsrMatrices(At, At_ref);
+}
+
+template <typename exec_space>
+void testTransposeBsr(int numRows, int numCols, int blockSize) {
+  using scalar_t  = default_scalar;
+  using lno_t     = default_lno_t;
+  using size_type = default_size_type;
+  using mem_space = typename exec_space::memory_space;
+  using device_t  = Kokkos::Device<exec_space, mem_space>;
+  using bsrMat_t =
+      typename KokkosSparse::Experimental::BsrMatrix<scalar_t, lno_t, device_t,
+                                                     void, size_type>;
+  using c_rowmap_t  = typename bsrMat_t::row_map_type;
+  using c_entries_t = typename bsrMat_t::index_type;
+  using c_values_t  = typename bsrMat_t::values_type;
+  using rowmap_t    = typename bsrMat_t::row_map_type::non_const_type;
+  using entries_t   = typename bsrMat_t::index_type::non_const_type;
+  using values_t    = typename bsrMat_t::values_type::non_const_type;
+
+  // Generate a matrix that has 0 entries in some rows
+  size_type nnz = 10 * numRows;
+  bsrMat_t A    = KokkosSparse::Impl::kk_generate_sparse_matrix<bsrMat_t>(
+      blockSize, numRows, numCols, nnz, 3, numRows / 4);
+
+  // compute the transpose while unsorted, then transpose again
+  rowmap_t t_rowmap("Rowmap^T", numCols + 1);  // this view is initialized to 0
+  entries_t t_entries(
+      Kokkos::view_alloc(Kokkos::WithoutInitializing, "Entries^T"),
+      A.graph.entries.extent(0));
+  values_t t_values(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Values^T"),
+                    A.values.extent(0));
+  rowmap_t tt_rowmap("Rowmap^T^T",
+                     numRows + 1);  // this view is initialized to 0
+  entries_t tt_entries(
+      Kokkos::view_alloc(Kokkos::WithoutInitializing, "Entries^T^T"),
+      A.graph.entries.extent(0));
+  values_t tt_values(
+      Kokkos::view_alloc(Kokkos::WithoutInitializing, "Values^T"),
+      A.values.extent(0));
+
+  KokkosSparse::Impl::transpose_bsr_matrix<c_rowmap_t, c_entries_t, c_values_t,
+                                           rowmap_t, entries_t, values_t,
+                                           exec_space>(
+      numRows, numCols, blockSize, A.graph.row_map, A.graph.entries, A.values,
+      t_rowmap, t_entries, t_values);
+
+  KokkosSparse::Impl::transpose_bsr_matrix<
+      rowmap_t, entries_t, values_t, rowmap_t, entries_t, values_t, exec_space>(
+      numCols, numRows, blockSize, t_rowmap, t_entries, t_values, tt_rowmap,
+      tt_entries, tt_values);
+  bsrMat_t Att("Att", numRows, numCols, nnz, tt_values, tt_rowmap, tt_entries,
+               blockSize);
+
+  // Sort both the transpose-transpose, and the original matrix (to compare
+  // directly)
+  KokkosSparse::sort_bsr_matrix(A);
+
+  KokkosSparse::sort_bsr_matrix(Att);
+
+  CompareBsrMatrices(A, Att);
+}
+
+TEST_F(TestCategory, sparse_transpose_matrix) {
+  // Test both matrix and graph transpose with various sizes
+  testTranspose<TestExecSpace>(100, 100, true);
+  testTranspose<TestExecSpace>(500, 50, true);
+  testTranspose<TestExecSpace>(50, 500, true);
+  testTranspose<TestExecSpace>(4000, 2000, true);
+  testTranspose<TestExecSpace>(2000, 4000, true);
+  testTranspose<TestExecSpace>(2000, 2000, true);
+}
+
+TEST_F(TestCategory, sparse_transpose_graph) {
+  testTranspose<TestExecSpace>(100, 100, false);
+  testTranspose<TestExecSpace>(500, 50, false);
+  testTranspose<TestExecSpace>(50, 500, false);
+  testTranspose<TestExecSpace>(4000, 2000, false);
+  testTranspose<TestExecSpace>(2000, 4000, false);
+  testTranspose<TestExecSpace>(2000, 2000, false);
+}
+
+TEST_F(TestCategory, sparse_transpose_bsr_matrix) {
+  testTransposeBsrRef<TestExecSpace>();
+  // Test bsrMatrix transpose with various sizes
+  testTransposeBsr<TestExecSpace>(100, 100, 3);
+  testTransposeBsr<TestExecSpace>(500, 50, 5);
+  testTransposeBsr<TestExecSpace>(50, 500, 16);
+  testTransposeBsr<TestExecSpace>(4000, 2000, 3);
+  testTransposeBsr<TestExecSpace>(2000, 4000, 3);
+  testTransposeBsr<TestExecSpace>(2000, 2000, 5);
+}
+
+#endif
diff --git a/unit_test/sparse/Test_Sparse_Utils_cusparse.hpp b/unit_test/sparse/Test_Sparse_Utils_cusparse.hpp
index 3d85ec394a..0ad16c54d0 100644
--- a/unit_test/sparse/Test_Sparse_Utils_cusparse.hpp
+++ b/unit_test/sparse/Test_Sparse_Utils_cusparse.hpp
@@ -7,7 +7,7 @@
 #include <gtest/gtest.h>
 #include <Kokkos_Core.hpp>
 
-#include "KokkosKernels_SparseUtils_cusparse.hpp"
+#include "KokkosSparse_Utils_cusparse.hpp"
 
 void test_cusparse_safe_call() {
   bool caught_exception = false;
diff --git a/unit_test/sparse/Test_Sparse_block_gauss_seidel.hpp b/unit_test/sparse/Test_Sparse_block_gauss_seidel.hpp
index d76f6be812..51e0899529 100644
--- a/unit_test/sparse/Test_Sparse_block_gauss_seidel.hpp
+++ b/unit_test/sparse/Test_Sparse_block_gauss_seidel.hpp
@@ -48,7 +48,8 @@
 #include "KokkosKernels_TestUtils.hpp"
 #include "KokkosKernels_Handle.hpp"
 #include "KokkosKernels_IOUtils.hpp"
-#include "KokkosKernels_SparseUtils.hpp"
+#include "KokkosSparse_IOUtils.hpp"
+#include "KokkosSparse_Utils.hpp"
 #include <KokkosSparse_spmv.hpp>
 #include <KokkosBlas1_dot.hpp>
 #include <KokkosBlas1_axpby.hpp>
@@ -58,19 +59,10 @@
 #include <complex>
 #include "KokkosSparse_gauss_seidel.hpp"
 
-// #ifndef kokkos_complex_double
-// #define kokkos_complex_double Kokkos::complex<double>
-// #define kokkos_complex_float Kokkos::complex<float>
-// #endif
+using kokkos_complex_double = Kokkos::complex<double>;
+using kokkos_complex_float  = Kokkos::complex<float>;
 
-typedef Kokkos::complex<double> kokkos_complex_double;
-typedef Kokkos::complex<float> kokkos_complex_float;
-
-using namespace KokkosKernels;
-using namespace KokkosKernels::Impl;
-using namespace KokkosKernels::Experimental;
-using namespace KokkosSparse;
-using namespace KokkosSparse::Experimental;
+namespace KSExp = KokkosSparse::Experimental;
 
 namespace Test {
 
@@ -90,8 +82,9 @@ struct GSTestParams {
 
   // Note: GS_DEFAULT is same as GS_TEAM and - for blocks - as GS_PERMUTED
   // Note: GS_TWOSTAGE and GS_CLUSTER are not supported for blocks
-  std::vector<GSAlgorithm> gs_algorithms = {GS_DEFAULT};
-  std::vector<size_t> shmem_sizes        = {
+  std::vector<KokkosSparse::GSAlgorithm> gs_algorithms = {
+      KokkosSparse::GS_DEFAULT};
+  std::vector<size_t> shmem_sizes = {
       32128,
       2008  // make the shmem small on gpus so that it will test 2 level
             // algorithm.
@@ -119,12 +112,11 @@ int run_block_gauss_seidel_1(
   typedef typename lno_nnz_view_t::value_type lno_t;
   typedef typename scalar_view_t::value_type scalar_t;
 
-  constexpr auto format = MatrixTraits<mtx_t>::format;
+  constexpr auto format = KokkosSparse::Impl::MatrixTraits<mtx_t>::format;
 
-  typedef KokkosKernelsHandle<
+  using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle<
       size_type, lno_t, scalar_t, typename mtx_t::execution_space,
-      typename mtx_t::memory_space, typename mtx_t::memory_space>
-      KernelHandle;
+      typename mtx_t::memory_space, typename mtx_t::memory_space>;
   KernelHandle kh;
   kh.set_team_work_size(16);
   kh.set_shmem_size(shmem_size);
@@ -136,33 +128,33 @@ int run_block_gauss_seidel_1(
   const int apply_count   = 100;
 
   if (!skip_symbolic) {
-    block_gauss_seidel_symbolic(&kh, num_rows_1, num_cols_1, block_size,
-                                input_mat.graph.row_map,
-                                input_mat.graph.entries, is_symmetric_graph);
+    KSExp::block_gauss_seidel_symbolic(
+        &kh, num_rows_1, num_cols_1, block_size, input_mat.graph.row_map,
+        input_mat.graph.entries, is_symmetric_graph);
   }
 
   if (!skip_numeric) {
-    block_gauss_seidel_numeric<format>(
+    KSExp::block_gauss_seidel_numeric<format>(
         &kh, num_rows_1, num_cols_1, block_size, input_mat.graph.row_map,
         input_mat.graph.entries, input_mat.values, is_symmetric_graph);
   }
 
   switch (apply_type) {
     case Test::forward_sweep:
-      forward_sweep_block_gauss_seidel_apply<format>(
+      KSExp::forward_sweep_block_gauss_seidel_apply<format>(
           &kh, num_rows_1, num_cols_1, block_size, input_mat.graph.row_map,
           input_mat.graph.entries, input_mat.values, x_vector, y_vector, false,
           true, omega, apply_count);
       break;
     case Test::backward_sweep:
-      backward_sweep_block_gauss_seidel_apply<format>(
+      KSExp::backward_sweep_block_gauss_seidel_apply<format>(
           &kh, num_rows_1, num_cols_1, block_size, input_mat.graph.row_map,
           input_mat.graph.entries, input_mat.values, x_vector, y_vector, false,
           true, omega, apply_count);
       break;
     case Test::symmetric:
     default:
-      symmetric_block_gauss_seidel_apply<format>(
+      KSExp::symmetric_block_gauss_seidel_apply<format>(
           &kh, num_rows_1, num_cols_1, block_size, input_mat.graph.row_map,
           input_mat.graph.entries, input_mat.values, x_vector, y_vector, false,
           true, omega, apply_count);
@@ -175,15 +167,15 @@ int run_block_gauss_seidel_1(
 
 }  // namespace Test
 
-template <SparseMatrixFormat mtx_format, typename scalar_t, typename lno_t,
-          typename size_type, typename device>
+template <KokkosSparse::SparseMatrixFormat mtx_format, typename scalar_t,
+          typename lno_t, typename size_type, typename device>
 void test_block_gauss_seidel_rank1(lno_t numRows, size_type nnz,
                                    lno_t bandwidth, lno_t row_size_variance) {
   using namespace Test;
   srand(245);
-  typedef
-      typename KokkosSparse::CrsMatrix<scalar_t, lno_t, device, void, size_type>
-          crsMat_t;
+  using crsMat_t = typename KokkosSparse::CrsMatrix<scalar_t, lno_t, device,
+                                                    void, size_type>;
+  using MatrixConverter = KokkosSparse::Impl::MatrixConverter<mtx_format>;
 
   typedef typename device::execution_space exec_space;
   typedef typename crsMat_t::StaticCrsGraphType graph_t;
@@ -200,7 +192,7 @@ void test_block_gauss_seidel_rank1(lno_t numRows, size_type nnz,
   lno_t block_size = params.block_size;
 
   crsMat_t crsmat =
-      KokkosKernels::Impl::kk_generate_diagonally_dominant_sparse_matrix<
+      KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix<
           crsMat_t>(numRows, numCols, nnz, row_size_variance, bandwidth);
 
   lno_view_t pf_rm;
@@ -211,16 +203,15 @@ void test_block_gauss_seidel_rank1(lno_t numRows, size_type nnz,
   // this makes consecutive 5 rows to have same columns.
   // it will add scalar 0's for those entries that does not exists.
   // the result is still a point crs matrix.
-  KokkosKernels::Impl::kk_create_blockcrs_formated_point_crsmatrix(
+  KokkosSparse::Impl::kk_create_blockcrs_formatted_point_crsmatrix(
       block_size, crsmat.numRows(), crsmat.numCols(), crsmat.graph.row_map,
       crsmat.graph.entries, crsmat.values, out_r, out_c, pf_rm, pf_e, pf_v);
   graph_t static_graph2(pf_e, pf_rm);
   crsMat_t crsmat2("CrsMatrix2", out_c, pf_v, static_graph2);
 
   // this converts the previous generated matrix to block matrix.
-  auto input_mat =
-      MatrixConverter<mtx_format>::from_blockcrs_formated_point_crsmatrix(
-          crsmat2, block_size);
+  auto input_mat = MatrixConverter::from_blockcrs_formatted_point_crsmatrix(
+      crsmat2, block_size);
 
   lno_t nv = ((crsmat2.numRows() + block_size - 1) / block_size) * block_size;
 
@@ -262,15 +253,15 @@ void test_block_gauss_seidel_rank1(lno_t numRows, size_type nnz,
   // device::execution_space::finalize();
 }
 
-template <SparseMatrixFormat mtx_format, typename scalar_t, typename lno_t,
-          typename size_type, typename device>
+template <KokkosSparse::SparseMatrixFormat mtx_format, typename scalar_t,
+          typename lno_t, typename size_type, typename device>
 void test_block_gauss_seidel_rank2(lno_t numRows, size_type nnz,
                                    lno_t bandwidth, lno_t row_size_variance) {
   using namespace Test;
   srand(245);
-  typedef
-      typename KokkosSparse::CrsMatrix<scalar_t, lno_t, device, void, size_type>
-          crsMat_t;
+  using crsMat_t = typename KokkosSparse::CrsMatrix<scalar_t, lno_t, device,
+                                                    void, size_type>;
+  using MatrixConverter = KokkosSparse::Impl::MatrixConverter<mtx_format>;
 
   typedef typename device::execution_space exec_space;
   typedef typename crsMat_t::StaticCrsGraphType graph_t;
@@ -288,7 +279,7 @@ void test_block_gauss_seidel_rank2(lno_t numRows, size_type nnz,
   lno_t block_size = params.block_size;
 
   crsMat_t crsmat =
-      KokkosKernels::Impl::kk_generate_diagonally_dominant_sparse_matrix<
+      KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix<
           crsMat_t>(numRows, numCols, nnz, row_size_variance, bandwidth);
 
   lno_view_t pf_rm;
@@ -299,15 +290,14 @@ void test_block_gauss_seidel_rank2(lno_t numRows, size_type nnz,
   // this makes consecutive 5 rows to have same columns.
   // it will add scalar 0's for those entries that does not exists.
   // the result is still a point crs matrix.
-  KokkosKernels::Impl::kk_create_blockcrs_formated_point_crsmatrix(
+  KokkosSparse::Impl::kk_create_blockcrs_formatted_point_crsmatrix(
       block_size, crsmat.numRows(), crsmat.numCols(), crsmat.graph.row_map,
       crsmat.graph.entries, crsmat.values, out_r, out_c, pf_rm, pf_e, pf_v);
   graph_t static_graph2(pf_e, pf_rm);
   crsMat_t crsmat2("CrsMatrix2", out_c, pf_v, static_graph2);
 
-  auto input_mat =
-      MatrixConverter<mtx_format>::from_blockcrs_formated_point_crsmatrix(
-          crsmat2, block_size);
+  auto input_mat = MatrixConverter::from_blockcrs_formatted_point_crsmatrix(
+      crsmat2, block_size);
 
   lno_t nv = ((crsmat2.numRows() + block_size - 1) / block_size) * block_size;
 
@@ -372,8 +362,8 @@ void test_block_gauss_seidel_rank2(lno_t numRows, size_type nnz,
   // device::execution_space::finalize();
 }
 
-template <SparseMatrixFormat mtx_format, typename scalar_t, typename lno_t,
-          typename size_type, typename device>
+template <KokkosSparse::SparseMatrixFormat mtx_format, typename scalar_t,
+          typename lno_t, typename size_type, typename device>
 void test_block_gauss_seidel_empty() {
   using namespace Test;
   typedef
@@ -383,10 +373,9 @@ void test_block_gauss_seidel_empty() {
   typedef typename graph_t::row_map_type::non_const_type row_map_type;
   typedef typename graph_t::entries_type::non_const_type entries_type;
   typedef typename crsMat_t::values_type::non_const_type scalar_view_t;
-  typedef KokkosKernelsHandle<
+  using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle<
       size_type, lno_t, scalar_t, typename device::execution_space,
-      typename device::memory_space, typename device::memory_space>
-      KernelHandle;
+      typename device::memory_space, typename device::memory_space>;
   // The rowmap of a zero-row matrix can be length 0 or 1, so Gauss-Seidel
   // should work with both (the setup and apply are essentially no-ops but they
   // shouldn't crash or throw exceptions) For this test, create size-0 and
@@ -394,7 +383,7 @@ void test_block_gauss_seidel_empty() {
   // which can trigger different bugs.
   for (const int rowmapLen : {0, 1, 5}) {
     KernelHandle kh;
-    kh.create_gs_handle(GS_DEFAULT);
+    kh.create_gs_handle(KokkosSparse::GS_DEFAULT);
     const auto num_rows    = KOKKOSKERNELS_MACRO_MAX(0, rowmapLen - 1);
     const lno_t block_size = 1;  // irrelevant (no values here)
     // initialized to 0
@@ -402,183 +391,58 @@ void test_block_gauss_seidel_empty() {
     entries_type entries("Entries", 0);
     scalar_view_t values("Values", 0);
     // also, make sure graph symmetrization doesn't crash on zero rows
-    block_gauss_seidel_symbolic(&kh, num_rows, num_rows, block_size, rowmap,
-                                entries, false);
-    block_gauss_seidel_numeric<mtx_format>(&kh, num_rows, num_rows, block_size,
-                                           rowmap, entries, values, false);
+    KSExp::block_gauss_seidel_symbolic(&kh, num_rows, num_rows, block_size,
+                                       rowmap, entries, false);
+    KSExp::block_gauss_seidel_numeric<mtx_format>(
+        &kh, num_rows, num_rows, block_size, rowmap, entries, values, false);
     scalar_view_t x("X", num_rows);
     scalar_view_t y("Y", num_rows);
     scalar_t omega(0.9);
-    symmetric_block_gauss_seidel_apply<mtx_format>(
+    KSExp::symmetric_block_gauss_seidel_apply<mtx_format>(
         &kh, num_rows, num_rows, block_size, rowmap, entries, values, x, y,
         false, true, omega, 3);
     kh.destroy_gs_handle();
   }
 }
 
-#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE)                                    \
+#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE)                      \
   TEST_F(                                                                                \
       TestCategory,                                                                      \
       sparse_blockcrs_gauss_seidel_rank1##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \
-    test_block_gauss_seidel_rank1<BlockCRS, SCALAR, ORDINAL, OFFSET, DEVICE>(            \
-        500, 500 * 10, 70, 3);                                                           \
+    test_block_gauss_seidel_rank1<KokkosSparse::BlockCRS, SCALAR, ORDINAL,               \
+                                  OFFSET, DEVICE>(500, 500 * 10, 70, 3);                 \
   }                                                                                      \
   TEST_F(                                                                                \
       TestCategory,                                                                      \
       sparse_blockcrs_gauss_seidel_rank2_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) {   \
-    test_block_gauss_seidel_rank2<BlockCRS, SCALAR, ORDINAL, OFFSET, DEVICE>(            \
-        500, 500 * 10, 70, 3);                                                           \
+    test_block_gauss_seidel_rank2<KokkosSparse::BlockCRS, SCALAR, ORDINAL,               \
+                                  OFFSET, DEVICE>(500, 500 * 10, 70, 3);                 \
   }                                                                                      \
   TEST_F(                                                                                \
       TestCategory,                                                                      \
       sparse_blockcrs_gauss_seidel_empty_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) {   \
-    test_block_gauss_seidel_empty<BlockCRS, SCALAR, ORDINAL, OFFSET,                     \
-                                  DEVICE>();                                             \
+    test_block_gauss_seidel_empty<KokkosSparse::BlockCRS, SCALAR, ORDINAL,               \
+                                  OFFSET, DEVICE>();                                     \
   }                                                                                      \
   TEST_F(                                                                                \
       TestCategory,                                                                      \
       sparse_bsr_gauss_seidel_rank1_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) {        \
-    test_block_gauss_seidel_rank1<BSR, SCALAR, ORDINAL, OFFSET, DEVICE>(                 \
-        500, 500 * 10, 70, 3);                                                           \
+    test_block_gauss_seidel_rank1<KokkosSparse::BSR, SCALAR, ORDINAL, OFFSET,            \
+                                  DEVICE>(500, 500 * 10, 70, 3);                         \
   }                                                                                      \
   TEST_F(                                                                                \
       TestCategory,                                                                      \
       sparse_bsr_gauss_seidel_rank2_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) {        \
-    test_block_gauss_seidel_rank2<BSR, SCALAR, ORDINAL, OFFSET, DEVICE>(                 \
-        500, 500 * 10, 70, 3);                                                           \
+    test_block_gauss_seidel_rank2<KokkosSparse::BSR, SCALAR, ORDINAL, OFFSET,            \
+                                  DEVICE>(500, 500 * 10, 70, 3);                         \
   }                                                                                      \
   TEST_F(                                                                                \
       TestCategory,                                                                      \
       sparse_bsr_gauss_seidel_empty_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) {        \
-    test_block_gauss_seidel_empty<BSR, SCALAR, ORDINAL, OFFSET, DEVICE>();               \
+    test_block_gauss_seidel_empty<KokkosSparse::BSR, SCALAR, ORDINAL, OFFSET,            \
+                                  DEVICE>();                                             \
   }
 
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&      \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&        \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||     \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&         \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&    \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&           \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||  \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int64_t, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&       \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&        \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(float, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&           \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||     \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(float, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&    \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&           \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(float, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&           \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||  \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(float, int64_t, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&            \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||            \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_double, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&        \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||            \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&            \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||         \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_double, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&        \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||         \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&           \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||           \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_float, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&       \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||           \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&           \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||        \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&       \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||        \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace)
-#endif
-
-#undef EXECUTE_TEST
+#include <Test_Common_Test_All_Type_Combos.hpp>
+
+#undef KOKKOSKERNELS_EXECUTE_TEST
diff --git a/unit_test/sparse/Test_Sparse_bspgemm.hpp b/unit_test/sparse/Test_Sparse_bspgemm.hpp
new file mode 100644
index 0000000000..7374ac6a78
--- /dev/null
+++ b/unit_test/sparse/Test_Sparse_bspgemm.hpp
@@ -0,0 +1,318 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+#include <Kokkos_Core.hpp>
+
+#include "KokkosSparse_Utils.hpp"
+#include "KokkosSparse_SortCrs.hpp"
+#include "KokkosSparse_spgemm.hpp"
+#include "KokkosSparse_BsrMatrix.hpp"
+#include "KokkosSparse_IOUtils.hpp"
+
+using namespace KokkosSparse;
+
+namespace Test {
+
+template <typename bsrMat_t>
+int run_block_spgemm(const bsrMat_t A, const bsrMat_t B, bsrMat_t &C,
+                     // parameters
+                     KokkosSparse::SPGEMMAlgorithm spgemm_algorithm,
+                     bool use_dynamic_scheduling = true,
+                     size_t shmem_size           = 0) {
+  typedef typename bsrMat_t::size_type size_type;
+  typedef typename bsrMat_t::ordinal_type lno_t;
+  typedef typename bsrMat_t::value_type scalar_t;
+  typedef typename bsrMat_t::device_type device;
+  typedef typename bsrMat_t::memory_space memory_space;
+
+  typedef KokkosKernels::Experimental::KokkosKernelsHandle<
+      size_type, lno_t, scalar_t, typename device::execution_space,
+      memory_space, memory_space>
+      KernelHandle;
+
+  KernelHandle kh;
+  kh.set_team_work_size(16);
+  kh.set_dynamic_scheduling(use_dynamic_scheduling);
+
+  kh.create_spgemm_handle(spgemm_algorithm);
+
+  if (shmem_size > 0) {
+    kh.set_shmem_size(shmem_size);
+  }
+  KokkosSparse::block_spgemm_symbolic(kh, A, false, B, false, C);
+  KokkosSparse::block_spgemm_numeric(kh, A, false, B, false, C);
+  kh.destroy_spgemm_handle();
+
+  return 0;
+}
+
+template <typename bsrMat_t>
+bool is_same_block_matrix(bsrMat_t output_mat_actual,
+                          bsrMat_t output_mat_reference) {
+  using device         = typename bsrMat_t::device_type;
+  using graph_t        = typename bsrMat_t::StaticCrsGraphType;
+  using lno_view_t     = typename graph_t::row_map_type::non_const_type;
+  using lno_nnz_view_t = typename graph_t::entries_type::non_const_type;
+  using scalar_view_t  = typename bsrMat_t::values_type::non_const_type;
+
+  size_t nrows_actual    = output_mat_actual.numRows();
+  size_t nentries_actual = output_mat_actual.graph.entries.extent(0);
+  size_t nvals_actual    = output_mat_actual.values.extent(0);
+
+  size_t nrows_reference    = output_mat_reference.numRows();
+  size_t nentries_reference = output_mat_reference.graph.entries.extent(0);
+  size_t nvals_reference    = output_mat_reference.values.extent(0);
+
+  if (nrows_actual != nrows_reference) {
+    std::cout << "nrows_actual:" << nrows_actual
+              << " nrows_reference:" << nrows_reference << std::endl;
+    return false;
+  }
+  if (nentries_actual != nentries_reference) {
+    std::cout << "nentries_actual:" << nentries_actual
+              << " nentries_reference:" << nentries_reference << std::endl;
+    return false;
+  }
+  if (nvals_actual != nvals_reference) {
+    std::cout << "nvals_actual:" << nvals_actual
+              << " nvals_reference:" << nvals_reference << std::endl;
+    return false;
+  }
+
+  KokkosSparse::sort_bsr_matrix(output_mat_actual);
+  KokkosSparse::sort_bsr_matrix(output_mat_reference);
+
+  bool is_identical = true;
+  is_identical      = KokkosKernels::Impl::kk_is_identical_view<
+      typename graph_t::row_map_type, typename graph_t::row_map_type,
+      typename lno_view_t::value_type, typename device::execution_space>(
+      output_mat_actual.graph.row_map, output_mat_reference.graph.row_map, 0);
+
+  if (!is_identical) {
+    std::cout << "rowmaps are different." << std::endl;
+    std::cout << "Actual rowmap:\n";
+    KokkosKernels::Impl::kk_print_1Dview(output_mat_actual.graph.row_map);
+    std::cout << "Correct rowmap (SPGEMM_DEBUG):\n";
+    KokkosKernels::Impl::kk_print_1Dview(output_mat_reference.graph.row_map);
+    return false;
+  }
+
+  is_identical = KokkosKernels::Impl::kk_is_identical_view<
+      lno_nnz_view_t, lno_nnz_view_t, typename lno_nnz_view_t::value_type,
+      typename device::execution_space>(output_mat_actual.graph.entries,
+                                        output_mat_reference.graph.entries, 0);
+
+  if (!is_identical) {
+    std::cout << "entries are different." << std::endl;
+    KokkosKernels::Impl::kk_print_1Dview(output_mat_actual.graph.entries);
+    KokkosKernels::Impl::kk_print_1Dview(output_mat_reference.graph.entries);
+    return false;
+  }
+
+  typedef typename Kokkos::Details::ArithTraits<
+      typename scalar_view_t::non_const_value_type>::mag_type eps_type;
+  eps_type eps = std::is_same<eps_type, float>::value ? 3.7e-3 : 1e-7;
+
+  is_identical = KokkosKernels::Impl::kk_is_relatively_identical_view<
+      scalar_view_t, scalar_view_t, eps_type, typename device::execution_space>(
+      output_mat_actual.values, output_mat_reference.values, eps);
+
+  if (!is_identical) {
+    std::cout << "values are different." << std::endl;
+    KokkosKernels::Impl::kk_print_1Dview(output_mat_actual.values);
+    KokkosKernels::Impl::kk_print_1Dview(output_mat_reference.values);
+
+    return false;
+  }
+  return true;
+}
+}  // namespace Test
+
+// Generate matrices and test all supported spgemm algorithms.
+// C := AB, where A is m*k, B is k*n, and C is m*n.
+template <typename scalar_t, typename lno_t, typename size_type,
+          typename device>
+void test_bspgemm(lno_t blkDim, lno_t m, lno_t k, lno_t n, size_type nnz,
+                  lno_t bandwidth, lno_t row_size_variance,
+                  const bool use_dynamic_scheduling = true,
+                  const size_t shared_memory_size   = 0) {
+  using namespace Test;
+  // device::execution_space::initialize();
+  // device::execution_space::print_configuration(std::cout);
+
+  using bsrMat_t =
+      KokkosSparse::Experimental::BsrMatrix<scalar_t, lno_t, device, void,
+                                            size_type>;
+
+  // Generate random compressed sparse row matrix. Randomly generated (non-zero)
+  // values are stored in a 1-D (1 rank) array.
+  bsrMat_t A = KokkosSparse::Impl::kk_generate_sparse_matrix<bsrMat_t>(
+      blkDim, m, k, nnz, row_size_variance, bandwidth);
+  bsrMat_t B = KokkosSparse::Impl::kk_generate_sparse_matrix<bsrMat_t>(
+      blkDim, k, n, nnz, row_size_variance, bandwidth);
+
+  const bool is_empy_case = m < 1 || n < 1 || k < 1 || nnz < 1;
+
+  bsrMat_t output_mat2;
+  run_block_spgemm(A, B, output_mat2, SPGEMM_DEBUG, use_dynamic_scheduling,
+                   shared_memory_size);
+
+  std::vector<SPGEMMAlgorithm> algorithms = {
+      SPGEMM_KK,
+      SPGEMM_KK_MEMORY /* alias SPGEMM_KK_MEMSPEED */,
+      SPGEMM_KK_SPEED /* alias SPGEMM_KK_DENSE */,
+      SPGEMM_MKL /* verify failure in case of missing build */,
+  };
+
+  if (!KokkosKernels::Impl::kk_is_gpu_exec_space<
+          typename device::execution_space>()) {
+    // SPGEMM_KK_LP is useful on CPU to cover MultiCoreTag4 functor
+    // (otherwise skipped) but on GPU it's same as SPGEMM_KK, so we can skip it.
+    algorithms.push_back(SPGEMM_KK_LP);
+  }
+
+  for (auto spgemm_algorithm : algorithms) {
+    const uint64_t max_integer = Kokkos::ArithTraits<int>::max();
+    std::string algo           = "UNKNOWN";
+    bool is_expected_to_fail   = false;
+
+    switch (spgemm_algorithm) {
+      case SPGEMM_CUSPARSE:
+        // TODO: add these test failure cases for cusparse too.
+        algo = "SPGEMM_CUSPARSE";
+#ifndef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
+        is_expected_to_fail = true;
+#endif
+        break;
+
+      case SPGEMM_MKL:
+        algo                = "SPGEMM_MKL";
+        is_expected_to_fail = !is_empy_case;  // TODO: add block MKL impl
+#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL
+        if (!KokkosSparse::Impl::mkl_is_supported_value_type<scalar_t>::value) {
+          is_expected_to_fail = true;
+        }
+#else
+        is_expected_to_fail = true;  // fail: MKL not enabled in build
+#endif
+        // MKL requires local ordinals to be int.
+        // Note: empty-array special case will NOT fail on this.
+        if (!std::is_same<int, lno_t>::value && !is_empy_case) {
+          is_expected_to_fail = true;
+        }
+        // if size_type is larger than int, mkl casts it to int.
+        // it will fail if casting cause overflow.
+        if (A.values.extent(0) > max_integer) {
+          is_expected_to_fail = true;
+        }
+        break;
+
+      case SPGEMM_KK: algo = "SPGEMM_KK"; break;
+      case SPGEMM_KK_LP: algo = "SPGEMM_KK_LP"; break;
+      case SPGEMM_KK_MEMSPEED: algo = "SPGEMM_KK_MEMSPEED"; break;
+      case SPGEMM_KK_SPEED: algo = "SPGEMM_KK_SPEED"; break;
+      case SPGEMM_KK_MEMORY: algo = "SPGEMM_KK_MEMORY"; break;
+      default: algo = "!!! UNKNOWN ALGO !!!";
+    }
+
+    Kokkos::Timer timer1;
+    bsrMat_t output_mat;
+
+    bool failed = false;
+    int res     = 0;
+    try {
+      res = run_block_spgemm(A, B, output_mat, spgemm_algorithm,
+                             use_dynamic_scheduling, shared_memory_size);
+    } catch (const char *message) {
+      EXPECT_TRUE(is_expected_to_fail) << algo << ": " << message;
+      failed = true;
+    } catch (std::string message) {
+      EXPECT_TRUE(is_expected_to_fail) << algo << ": " << message;
+      failed = true;
+    } catch (std::exception &e) {
+      EXPECT_TRUE(is_expected_to_fail) << algo << ": " << e.what();
+      failed = true;
+    }
+    EXPECT_EQ(is_expected_to_fail, failed);
+
+    // double spgemm_time = timer1.seconds();
+
+    timer1.reset();
+    if (!is_expected_to_fail) {
+      EXPECT_TRUE((res == 0)) << algo;
+      bool is_identical = is_same_block_matrix(output_mat, output_mat2);
+      EXPECT_TRUE(is_identical) << algo;
+      // EXPECT_TRUE( equal) << algo;
+    }
+    // std::cout << "algo:" << algo << " spgemm_time:" << spgemm_time << "
+    // output_check_time:" << timer1.seconds() << std::endl;
+  }
+  // device::execution_space::finalize();
+}
+
+// Note: Tests with shared memory specified aim to trigger specific GPU functors
+//       dispatched by matrix size and the available shared memory.
+#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE)        \
+  TEST_F(TestCategory,                                                     \
+         sparse_block_spgemm_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \
+    auto const SHMEM_AUTO = 0;                                             \
+    auto test_case        = test_bspgemm<SCALAR, ORDINAL, OFFSET, DEVICE>; \
+    /* Trigger SPGEMM_KK_MEMORY_SPREADTEAM on GPU */                       \
+    test_case(2, 50, 50, 50, 2000, 50, 5, true, 16 * 1024);                \
+    /* Trigger SPGEMM_KK -> SPGEMM_KK_MEMORY on GPU */                     \
+    test_case(2, 50, 50, 50, 1000, 50, 5, false, 16 * 1024);               \
+    /* Trigger SPGEMM_KK_MEMORY_BIGSPREADTEAM on GPU */                    \
+    test_case(2, 500, 500, 500, 32000, 500, 500, true, 16 * 1024);         \
+    /* trigger dense dispatch in hash method */                            \
+    test_case(2, 2, 3, 4, 2, 2, 0, true, 16 * 1024);                       \
+    /* zero-size handling */                                               \
+    test_case(2, 0, 0, 0, 0, 10, 10, true, SHMEM_AUTO);                    \
+    test_case(2, 0, 12, 5, 0, 10, 0, true, SHMEM_AUTO);                    \
+    test_case(2, 10, 10, 0, 0, 10, 10, true, SHMEM_AUTO);                  \
+  }
+
+#include <Test_Common_Test_All_Type_Combos.hpp>
+
+#undef KOKKOSKERNELS_EXECUTE_TEST
diff --git a/unit_test/sparse/Test_Sparse_csc2csr.hpp b/unit_test/sparse/Test_Sparse_csc2csr.hpp
new file mode 100644
index 0000000000..e7d2ad868e
--- /dev/null
+++ b/unit_test/sparse/Test_Sparse_csc2csr.hpp
@@ -0,0 +1,164 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include "KokkosSparse_csc2csr.hpp"
+#include "KokkosKernels_TestUtils.hpp"
+
+namespace Test {
+template <class ScalarType, class LayoutType, class ExeSpaceType>
+void doCsc2Csr(size_t m, size_t n, ScalarType min_val, ScalarType max_val,
+               bool fully_sparse = false) {
+  RandCscMat<ScalarType, LayoutType, ExeSpaceType> cscMat(
+      m, n, min_val, max_val, fully_sparse);
+  constexpr int league_size = 32;
+
+  auto csrMat = KokkosSparse::csc2csr(
+      cscMat.get_m(), cscMat.get_n(), cscMat.get_nnz(), cscMat.get_vals(),
+      cscMat.get_row_ids(), cscMat.get_col_map(), league_size);
+
+  auto csc_row_ids_d = cscMat.get_row_ids();
+  auto csc_col_map_d = cscMat.get_col_map();
+  auto csc_vals_d    = cscMat.get_vals();
+
+  using ViewTypeRowIds = decltype(csc_row_ids_d);
+  using ViewTypeColMap = decltype(csc_col_map_d);
+  using ViewTypeVals   = decltype(csc_vals_d);
+
+  // Copy to host
+  typename ViewTypeRowIds::HostMirror csc_row_ids =
+      Kokkos::create_mirror_view(csc_row_ids_d);
+  Kokkos::deep_copy(csc_row_ids, csc_row_ids_d);
+  typename ViewTypeColMap::HostMirror csc_col_map =
+      Kokkos::create_mirror_view(csc_col_map_d);
+  Kokkos::deep_copy(csc_col_map, csc_col_map_d);
+  typename ViewTypeVals::HostMirror csc_vals =
+      Kokkos::create_mirror_view(csc_vals_d);
+  Kokkos::deep_copy(csc_vals, csc_vals_d);
+
+  auto csr_col_ids_d = csrMat.graph.entries;
+  auto csr_row_map_d = csrMat.graph.row_map;
+  auto csr_vals_d    = csrMat.values;
+
+  using ViewTypeCsrColIds = decltype(csr_col_ids_d);
+  using ViewTypeCsrRowMap = decltype(csr_row_map_d);
+  using ViewTypeCsrVals   = decltype(csr_vals_d);
+
+  // Copy to host
+  typename ViewTypeCsrColIds::HostMirror csr_col_ids =
+      Kokkos::create_mirror_view(csr_col_ids_d);
+  Kokkos::deep_copy(csr_col_ids, csr_col_ids_d);
+  typename ViewTypeCsrRowMap::HostMirror csr_row_map =
+      Kokkos::create_mirror_view(csr_row_map_d);
+  Kokkos::deep_copy(csr_row_map, csr_row_map_d);
+  typename ViewTypeCsrVals::HostMirror csr_vals =
+      Kokkos::create_mirror_view(csr_vals_d);
+  Kokkos::deep_copy(csr_vals, csr_vals_d);
+
+  Kokkos::fence();
+
+  for (int j = 0; j < cscMat.get_n(); ++j) {
+    auto col_start = csc_col_map(j);
+    auto col_len   = csc_col_map(j + 1) - col_start;
+
+    for (int k = 0; k < col_len; ++k) {
+      auto i = col_start + k;
+
+      auto row_start = csr_row_map(csc_row_ids(i));
+      auto row_len   = csr_row_map(csc_row_ids(i) + 1) - row_start;
+      auto row_end   = row_start + row_len;
+
+      if (row_len == 0) continue;
+
+      // Linear search for corresponding element in csr matrix
+      int l = row_start;
+      while (l < row_end && csr_col_ids(l) != j) {
+        ++l;
+      }
+
+      if (l == row_end)
+        FAIL() << "csr element at (i: " << csc_row_ids(i) << ", j: " << j
+               << ") not found!" << std::endl;
+
+      ASSERT_EQ(csc_vals(i), csr_vals(l))
+          << "(i: " << csc_row_ids(i) << ", j: " << j << ")" << std::endl;
+    }
+  }
+}
+
+template <class LayoutType, class ExeSpaceType>
+void doAllScalarsCsc2Csr(size_t m, size_t n, int min, int max) {
+  doCsc2Csr<float, LayoutType, ExeSpaceType>(m, n, min, max);
+  doCsc2Csr<double, LayoutType, ExeSpaceType>(m, n, min, max);
+  doCsc2Csr<Kokkos::complex<float>, LayoutType, ExeSpaceType>(m, n, min, max);
+  doCsc2Csr<Kokkos::complex<double>, LayoutType, ExeSpaceType>(m, n, min, max);
+}
+
+template <class ExeSpaceType>
+void doAllLayoutsCsc2Csr(size_t m, size_t n, int min, int max) {
+  doAllScalarsCsc2Csr<Kokkos::LayoutLeft, ExeSpaceType>(m, n, min, max);
+  doAllScalarsCsc2Csr<Kokkos::LayoutRight, ExeSpaceType>(m, n, min, max);
+}
+
+template <class ExeSpaceType>
+void doAllCsc2csr(size_t m, size_t n) {
+  int min = 1, max = 10;
+  doAllLayoutsCsc2Csr<ExeSpaceType>(m, n, min, max);
+}
+
+TEST_F(TestCategory, sparse_csc2csr) {
+  // Square cases
+  for (size_t dim = 4; dim < 1024; dim *= 4)
+    doAllCsc2csr<TestExecSpace>(dim, dim);
+
+  // Non-square cases
+  for (size_t dim = 1; dim < 1024; dim *= 4) {
+    doAllCsc2csr<TestExecSpace>(dim * 3, dim);
+    doAllCsc2csr<TestExecSpace>(dim, dim * 3);
+  }
+
+  // Fully sparse
+  doCsc2Csr<float, Kokkos::LayoutLeft, TestExecSpace>(5, 5, 1, 10, true);
+  doCsc2Csr<double, Kokkos::LayoutRight, TestExecSpace>(50, 10, 10, 100, true);
+}
+}  // namespace Test
\ No newline at end of file
diff --git a/unit_test/sparse/Test_Sparse_gauss_seidel.hpp b/unit_test/sparse/Test_Sparse_gauss_seidel.hpp
index f255fc4fcf..627a9fc99e 100644
--- a/unit_test/sparse/Test_Sparse_gauss_seidel.hpp
+++ b/unit_test/sparse/Test_Sparse_gauss_seidel.hpp
@@ -47,6 +47,7 @@
 #include <Kokkos_Core.hpp>
 #include "KokkosKernels_Handle.hpp"
 #include "KokkosKernels_IOUtils.hpp"
+#include "KokkosSparse_IOUtils.hpp"
 //#include <Kokkos_Sparse_CrsMatrix.hpp>
 #include <KokkosSparse_spmv.hpp>
 #include <KokkosBlas1_dot.hpp>
@@ -61,7 +62,7 @@
 #include "KokkosSparse_gauss_seidel.hpp"
 #include "KokkosSparse_partitioning_impl.hpp"
 #include "KokkosSparse_sor_sequential_impl.hpp"
-#include "KokkosKernels_Sorting.hpp"
+#include "KokkosSparse_SortCrs.hpp"
 #include "KokkosKernels_TestUtils.hpp"
 
 // #ifndef kokkos_complex_double
@@ -183,7 +184,7 @@ void test_gauss_seidel_rank1(lno_t numRows, size_type nnz, lno_t bandwidth,
   srand(245);
   lno_t numCols = numRows;
   crsMat_t input_mat =
-      KokkosKernels::Impl::kk_generate_diagonally_dominant_sparse_matrix<
+      KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix<
           crsMat_t>(numRows, numCols, nnz, row_size_variance, bandwidth);
   if (symmetric) {
     // Symmetrize on host, rather than relying on the parallel versions (those
@@ -272,7 +273,7 @@ void test_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth,
 
   lno_t numCols = numRows;
   crsMat_t input_mat =
-      KokkosKernels::Impl::kk_generate_diagonally_dominant_sparse_matrix<
+      KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix<
           crsMat_t>(numRows, numCols, nnz, row_size_variance, bandwidth);
   if (symmetric) {
     // Symmetrize on host, rather than relying on the parallel versions (those
@@ -396,7 +397,7 @@ void test_sequential_sor(lno_t numRows, size_type nnz, lno_t bandwidth,
           crsMat_t;
   lno_t numCols = numRows;
   crsMat_t input_mat =
-      KokkosKernels::Impl::kk_generate_diagonally_dominant_sparse_matrix<
+      KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix<
           crsMat_t>(numRows, numCols, nnz, row_size_variance, bandwidth);
   auto rowmap  = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
                                                     input_mat.graph.row_map);
@@ -472,7 +473,7 @@ void test_balloon_clustering(lno_t numRows, size_type nnzPerRow,
   srand(245);
   size_type nnzTotal = nnzPerRow * numRows;
   lno_t nnzVariance  = nnzPerRow / 4;
-  crsMat_t A         = KokkosKernels::Impl::kk_generate_sparse_matrix<crsMat_t>(
+  crsMat_t A         = KokkosSparse::Impl::kk_generate_sparse_matrix<crsMat_t>(
       numRows, numRows, nnzTotal, nnzVariance, bandwidth);
   lno_row_view_t symRowmap;
   lno_nnz_view_t symEntries;
@@ -609,7 +610,7 @@ void test_gauss_seidel_long_rows(lno_t numRows, lno_t numLongRows,
                                     rowmap.data(), numRows + 1));
   crsMat_t input_mat("A", numRows, numRows, totalEntries, valuesView,
                      rowmapView, entriesView);
-  input_mat = KokkosKernels::sort_and_merge_matrix(input_mat);
+  input_mat = KokkosSparse::sort_and_merge_matrix(input_mat);
   if (symmetric) {
     // Symmetrize on host, rather than relying on the parallel versions (those
     // can be tested for symmetric=false)
@@ -660,11 +661,11 @@ void test_gauss_seidel_custom_coloring(lno_t numRows, lno_t nnzPerRow) {
   const scalar_t one = Kokkos::ArithTraits<scalar_t>::one();
   size_type nnz      = nnzPerRow * numRows;
   crsMat_t input_mat =
-      KokkosKernels::Impl::kk_generate_diagonally_dominant_sparse_matrix<
+      KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix<
           crsMat_t>(numRows, numRows, nnz, 0, numRows / 10, 2.0 * one);
   input_mat =
       Test::symmetrize<scalar_t, lno_t, size_type, device, crsMat_t>(input_mat);
-  input_mat = KokkosKernels::sort_and_merge_matrix(input_mat);
+  input_mat = KokkosSparse::sort_and_merge_matrix(input_mat);
   scalar_view_t solution_x(
       Kokkos::view_alloc(Kokkos::WithoutInitializing, "X (correct)"), numRows);
   create_random_x_vector(solution_x);
@@ -689,7 +690,7 @@ void test_gauss_seidel_custom_coloring(lno_t numRows, lno_t nnzPerRow) {
   EXPECT_LT(result_norm_res, 0.25 * initial_norm_res);
 }
 
-#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE)                                          \
+#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE)                            \
   TEST_F(                                                                                      \
       TestCategory,                                                                            \
       sparse##_##gauss_seidel_asymmetric_rank1##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \
@@ -743,132 +744,6 @@ void test_gauss_seidel_custom_coloring(lno_t numRows, lno_t nnzPerRow) {
                                                                        10);                    \
   }
 
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&      \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&        \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int, int, TestExecSpace)
-#endif
+#include <Test_Common_Test_All_Type_Combos.hpp>
 
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||     \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&         \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&    \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&           \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||  \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int64_t, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&       \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&        \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(float, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&           \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||     \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(float, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&    \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&           \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(float, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&           \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||  \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(float, int64_t, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&            \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||            \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_double, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&        \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||            \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&            \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||         \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_double, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&        \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||         \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&           \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||           \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_float, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&       \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||           \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&           \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||        \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&       \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||        \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace)
-#endif
-
-#undef EXECUTE_TEST
+#undef KOKKOSKERNELS_EXECUTE_TEST
diff --git a/unit_test/sparse/Test_Sparse_replaceSumInto.hpp b/unit_test/sparse/Test_Sparse_replaceSumInto.hpp
index dc51be7f7b..4036e7ddbd 100644
--- a/unit_test/sparse/Test_Sparse_replaceSumInto.hpp
+++ b/unit_test/sparse/Test_Sparse_replaceSumInto.hpp
@@ -266,139 +266,13 @@ void test_replaceSumInto() {
   EXPECT_TRUE(success);
 }
 
-#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE)                           \
+#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE)             \
   TEST_F(                                                                       \
       TestCategory,                                                             \
       sparse##_##replaceSumInto##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \
     test_replaceSumInto<SCALAR, ORDINAL, OFFSET, DEVICE>();                     \
   }
 
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&      \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&        \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||     \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&         \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&    \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&           \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||  \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int64_t, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&       \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&        \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(float, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&           \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||     \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(float, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&    \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&           \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(float, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&           \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||  \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(float, int64_t, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&            \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||            \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_double, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&        \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||            \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&            \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||         \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_double, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&        \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||         \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&           \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||           \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_float, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&       \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||           \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&           \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||        \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&       \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||        \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace)
-#endif
-
-#undef EXECUTE_TEST
+#include <Test_Common_Test_All_Type_Combos.hpp>
+
+#undef KOKKOSKERNELS_EXECUTE_TEST
diff --git a/unit_test/sparse/Test_Sparse_replaceSumIntoLonger.hpp b/unit_test/sparse/Test_Sparse_replaceSumIntoLonger.hpp
index 1c0e279366..e5e1266e1d 100644
--- a/unit_test/sparse/Test_Sparse_replaceSumIntoLonger.hpp
+++ b/unit_test/sparse/Test_Sparse_replaceSumIntoLonger.hpp
@@ -509,7 +509,7 @@ void test_replaceSumIntoLonger() {
   EXPECT_TRUE(success);
 }
 
-#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE)                                 \
+#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE)                   \
   TEST_F(                                                                             \
       TestCategory,                                                                   \
       sparse##_##replaceSumIntoLonger##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \
@@ -518,133 +518,9 @@ void test_replaceSumIntoLonger() {
 
 // FIXME SYCL: test hangs or gives "CL error -46 invalid kernel name"
 #ifndef KOKKOS_ENABLE_SYCL
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&      \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&        \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||     \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&         \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&    \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&           \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||  \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int64_t, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&       \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&        \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(float, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&           \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||     \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(float, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&    \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&           \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(float, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&           \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||  \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(float, int64_t, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&            \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||            \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_double, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&        \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||            \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&            \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||         \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_double, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&        \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||         \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&           \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||           \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_float, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&       \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||           \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&           \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||        \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&       \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||        \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace)
-#endif
-#endif
-
-#undef EXECUTE_TEST
+
+#include <Test_Common_Test_All_Type_Combos.hpp>
+
+#undef KOKKOSKERNELS_EXECUTE_TEST
+
+#endif  // KOKKOS_ENABLE_SYCL
diff --git a/unit_test/sparse/Test_Sparse_rocsparse.hpp b/unit_test/sparse/Test_Sparse_rocsparse.hpp
index 27e0b1f9fd..fe1bf8e9b2 100644
--- a/unit_test/sparse/Test_Sparse_rocsparse.hpp
+++ b/unit_test/sparse/Test_Sparse_rocsparse.hpp
@@ -7,7 +7,7 @@
 #include <gtest/gtest.h>
 #include <Kokkos_Core.hpp>
 #include <rocsparse.h>
-#include "KokkosKernels_SparseUtils_rocsparse.hpp"
+#include "KokkosSparse_Utils_rocsparse.hpp"
 
 void test_rocsparse_version() {
   // Print version
diff --git a/unit_test/sparse/Test_Sparse_spadd.hpp b/unit_test/sparse/Test_Sparse_spadd.hpp
index 01c1aad2b9..881f891837 100644
--- a/unit_test/sparse/Test_Sparse_spadd.hpp
+++ b/unit_test/sparse/Test_Sparse_spadd.hpp
@@ -250,7 +250,7 @@ void test_spadd_known_columns() {
   ASSERT_EQ(A.nnz(), C.nnz());
 }
 
-#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE)                                 \
+#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE)                   \
   TEST_F(                                                                             \
       TestCategory,                                                                   \
       sparse##_##spadd_sorted_input##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) {   \
@@ -269,132 +269,6 @@ void test_spadd_known_columns() {
     test_spadd<SCALAR, ORDINAL, OFFSET, DEVICE>(50, 50, 75, 100, false);              \
   }
 
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&      \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&        \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int, int, TestExecSpace)
-#endif
+#include <Test_Common_Test_All_Type_Combos.hpp>
 
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||     \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&         \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&    \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&           \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||  \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int64_t, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&       \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&        \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(float, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&           \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||     \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(float, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&    \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&           \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(float, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&           \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||  \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(float, int64_t, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&            \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||            \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_double, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&        \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||            \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&            \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||         \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_double, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&        \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||         \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&           \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||           \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_float, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&       \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||           \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&           \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||        \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&       \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||        \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace)
-#endif
-
-#undef EXECUTE_TEST
+#undef KOKKOSKERNELS_EXECUTE_TEST
diff --git a/unit_test/sparse/Test_Sparse_spgemm.hpp b/unit_test/sparse/Test_Sparse_spgemm.hpp
index dd22bb90dc..f52306ef74 100644
--- a/unit_test/sparse/Test_Sparse_spgemm.hpp
+++ b/unit_test/sparse/Test_Sparse_spgemm.hpp
@@ -45,9 +45,8 @@
 #include <gtest/gtest.h>
 #include <Kokkos_Core.hpp>
 
-#include "KokkosKernels_SparseUtils.hpp"
-#include "KokkosKernels_Sorting.hpp"
-#include <Kokkos_Concepts.hpp>
+#include "KokkosSparse_Utils.hpp"
+#include "KokkosSparse_SortCrs.hpp"
 #include <string>
 #include <stdexcept>
 
@@ -58,6 +57,7 @@
 #include <Kokkos_Core.hpp>
 
 #include <KokkosKernels_IOUtils.hpp>
+#include <KokkosSparse_IOUtils.hpp>
 
 // This file contains the matrix for test_issue402
 #include "matrixIssue402.hpp"
@@ -197,8 +197,8 @@ bool is_same_matrix(crsMat_t output_mat_actual, crsMat_t output_mat_reference) {
     return false;
   }
 
-  KokkosKernels::sort_crs_matrix(output_mat_actual);
-  KokkosKernels::sort_crs_matrix(output_mat_reference);
+  KokkosSparse::sort_crs_matrix(output_mat_actual);
+  KokkosSparse::sort_crs_matrix(output_mat_reference);
 
   bool is_identical = true;
   is_identical      = KokkosKernels::Impl::kk_is_identical_view<
@@ -229,7 +229,7 @@ bool is_same_matrix(crsMat_t output_mat_actual, crsMat_t output_mat_reference) {
 
   typedef typename Kokkos::Details::ArithTraits<
       typename scalar_view_t::non_const_value_type>::mag_type eps_type;
-  eps_type eps = std::is_same<eps_type, float>::value ? 2 * 1e-3 : 1e-7;
+  eps_type eps = std::is_same<eps_type, float>::value ? 3.7e-3 : 1e-7;
 
   is_identical = KokkosKernels::Impl::kk_is_relatively_identical_view<
       scalar_view_t, scalar_view_t, eps_type, typename device::execution_space>(
@@ -264,11 +264,13 @@ void test_spgemm(lno_t m, lno_t k, lno_t n, size_type nnz, lno_t bandwidth,
 
   // Generate random compressed sparse row matrix. Randomly generated (non-zero)
   // values are stored in a 1-D (1 rank) array.
-  crsMat_t A = KokkosKernels::Impl::kk_generate_sparse_matrix<crsMat_t>(
+  crsMat_t A = KokkosSparse::Impl::kk_generate_sparse_matrix<crsMat_t>(
       m, k, nnz, row_size_variance, bandwidth);
-  crsMat_t B = KokkosKernels::Impl::kk_generate_sparse_matrix<crsMat_t>(
+  crsMat_t B = KokkosSparse::Impl::kk_generate_sparse_matrix<crsMat_t>(
       k, n, nnz, row_size_variance, bandwidth);
 
+  const bool is_empy_case = m < 1 || n < 1 || k < 1 || nnz < 1;
+
   crsMat_t output_mat2;
   if (oldInterface)
     run_spgemm_old_interface<crsMat_t, device>(A, B, SPGEMM_DEBUG, output_mat2);
@@ -280,12 +282,12 @@ void test_spgemm(lno_t m, lno_t k, lno_t n, size_type nnz, lno_t bandwidth,
       SPGEMM_KK_SPEED /* alias SPGEMM_KK_DENSE */
   };
 
-#ifdef HAVE_KOKKOSKERNELS_MKL
+#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL
   algorithms.push_back(SPGEMM_MKL);
 #endif
 
   for (auto spgemm_algorithm : algorithms) {
-    const uint64_t max_integer = 2147483647;
+    const uint64_t max_integer = Kokkos::ArithTraits<int>::max();
     std::string algo           = "UNKNOWN";
     bool is_expected_to_fail   = false;
 
@@ -299,15 +301,15 @@ void test_spgemm(lno_t m, lno_t k, lno_t n, size_type nnz, lno_t bandwidth,
 #endif
         break;
 
-      case SPGEMM_MKL:
-        algo = "SPGEMM_MKL";
-        // MKL requires scalar to be either float or double
-        if (!(std::is_same<float, scalar_t>::value ||
-              std::is_same<double, scalar_t>::value)) {
+      case SPGEMM_MKL: algo = "SPGEMM_MKL";
+#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL
+        if (!KokkosSparse::Impl::mkl_is_supported_value_type<scalar_t>::value) {
           is_expected_to_fail = true;
         }
-        // mkl requires local ordinals to be int.
-        if (!(std::is_same<int, lno_t>::value)) {
+#endif
+        // MKL requires local ordinals to be int.
+        // Note: empty-array special case will NOT fail on this.
+        if (!std::is_same<int, lno_t>::value && !is_empy_case) {
           is_expected_to_fail = true;
         }
         // if size_type is larger than int, mkl casts it to int.
@@ -315,12 +317,6 @@ void test_spgemm(lno_t m, lno_t k, lno_t n, size_type nnz, lno_t bandwidth,
         if (A.values.extent(0) > max_integer) {
           is_expected_to_fail = true;
         }
-
-        if (!(Kokkos::SpaceAccessibility<
-                typename Kokkos::HostSpace::execution_space,
-                typename device::memory_space>::accessible)) {
-          is_expected_to_fail = true;
-        }
         break;
 
       case SPGEMM_KK: algo = "SPGEMM_KK"; break;
@@ -352,7 +348,7 @@ void test_spgemm(lno_t m, lno_t k, lno_t n, size_type nnz, lno_t bandwidth,
       EXPECT_TRUE(is_expected_to_fail) << algo << ": " << e.what();
       failed = true;
     }
-    EXPECT_TRUE((failed == is_expected_to_fail));
+    EXPECT_EQ(is_expected_to_fail, failed);
 
     // double spgemm_time = timer1.seconds();
 
@@ -407,7 +403,7 @@ void test_issue402() {
   lno_view_t Browmap("B = A^T rowmap", numRows + 1);
   lno_nnz_view_t Bentries("B = A^T entries", nnz);
   scalar_view_t Bvalues("B = A^T values", nnz);
-  KokkosKernels::Impl::transpose_matrix<
+  KokkosSparse::Impl::transpose_matrix<
       lno_view_t, lno_nnz_view_t, scalar_view_t, lno_view_t, lno_nnz_view_t,
       scalar_view_t, lno_view_t, typename device::execution_space>(
       numRows, numRows, Arowmap, Aentries, Avalues, Browmap, Bentries, Bvalues);
@@ -437,7 +433,7 @@ void test_issue402() {
       << "KKMEM still has issue 402 bug; C=AA' is incorrect!\n";
 }
 
-#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE)                          \
+#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE)            \
   TEST_F(TestCategory,                                                         \
          sparse##_##spgemm##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) {     \
     test_spgemm<SCALAR, ORDINAL, OFFSET, DEVICE>(10000, 10000, 10000,          \
@@ -458,132 +454,6 @@ void test_issue402() {
 // test_spgemm<SCALAR,ORDINAL,OFFSET,DEVICE>(50000, 50000 * 30, 100, 10);
 // test_spgemm<SCALAR,ORDINAL,OFFSET,DEVICE>(50000, 50000 * 30, 200, 10);
 
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&      \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&        \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||     \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&         \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&    \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&           \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||  \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int64_t, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&       \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&        \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(float, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&           \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||     \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(float, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&    \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&           \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(float, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&           \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||  \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(float, int64_t, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&            \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||            \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_double, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&        \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||            \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&            \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||         \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_double, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&        \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||         \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&           \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||           \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_float, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&       \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||           \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&           \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||        \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&       \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||        \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace)
-#endif
+#include <Test_Common_Test_All_Type_Combos.hpp>
 
-#undef EXECUTE_TEST
+#undef KOKKOSKERNELS_EXECUTE_TEST
diff --git a/unit_test/sparse/Test_Sparse_spgemm_jacobi.hpp b/unit_test/sparse/Test_Sparse_spgemm_jacobi.hpp
index 6f416e6f59..4ac707c249 100644
--- a/unit_test/sparse/Test_Sparse_spgemm_jacobi.hpp
+++ b/unit_test/sparse/Test_Sparse_spgemm_jacobi.hpp
@@ -45,9 +45,8 @@
 #include <gtest/gtest.h>
 #include <Kokkos_Core.hpp>
 
-#include "KokkosKernels_SparseUtils.hpp"
-#include "KokkosKernels_Sorting.hpp"
-#include <Kokkos_Concepts.hpp>
+#include "KokkosSparse_Utils.hpp"
+#include "KokkosSparse_SortCrs.hpp"
 #include <string>
 #include <stdexcept>
 
@@ -58,6 +57,7 @@
 #include <Kokkos_Core.hpp>
 
 #include <KokkosKernels_IOUtils.hpp>
+#include <KokkosSparse_IOUtils.hpp>
 
 using namespace KokkosSparse;
 using namespace KokkosSparse::Experimental;
@@ -154,7 +154,7 @@ bool is_same_mat(crsMat_t output_mat1, crsMat_t output_mat2) {
   size_t nentries2 = output_mat2.graph.entries.extent(0);
   size_t nvals2    = output_mat2.values.extent(0);
 
-  KokkosKernels::sort_crs_matrix(output_mat1);
+  KokkosSparse::sort_crs_matrix(output_mat1);
 
   if (nrows1 != nrows2) {
     std::cout << "nrows1:" << nrows1 << " nrows2:" << nrows2 << std::endl;
@@ -170,7 +170,7 @@ bool is_same_mat(crsMat_t output_mat1, crsMat_t output_mat2) {
     return false;
   }
 
-  KokkosKernels::sort_crs_matrix(output_mat2);
+  KokkosSparse::sort_crs_matrix(output_mat2);
 
   bool is_identical = true;
   is_identical      = KokkosKernels::Impl::kk_is_identical_view<
@@ -225,7 +225,7 @@ void test_spgemm_jacobi(lno_t numRows, size_type nnz, lno_t bandwidth,
 
   lno_t numCols = numRows;
   crsMat_t input_mat =
-      KokkosKernels::Impl::kk_generate_diagonally_dominant_sparse_matrix<
+      KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix<
           crsMat_t>(numRows, numCols, nnz, row_size_variance, bandwidth);
 
   crsMat_t output_mat2;
@@ -258,7 +258,7 @@ void test_spgemm_jacobi(lno_t numRows, size_type nnz, lno_t bandwidth,
   EXPECT_TRUE(is_identical);
 }
 
-#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE)                          \
+#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE)            \
   TEST_F(                                                                      \
       TestCategory,                                                            \
       sparse##_##spgemm_jacobi##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \
@@ -266,132 +266,6 @@ void test_spgemm_jacobi(lno_t numRows, size_type nnz, lno_t bandwidth,
                                                         10);                   \
   }
 
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&      \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&        \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||     \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&         \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&    \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&           \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||  \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int64_t, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&       \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&        \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(float, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&           \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||     \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(float, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&    \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&           \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(float, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&           \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||  \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(float, int64_t, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&            \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||            \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_double, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&        \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||            \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&            \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||         \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_double, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&        \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||         \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&           \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||           \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_float, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&       \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||           \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&           \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||        \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&       \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||        \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace)
-#endif
-
-#undef EXECUTE_TEST
+#include <Test_Common_Test_All_Type_Combos.hpp>
+
+#undef KOKKOSKERNELS_EXECUTE_TEST
diff --git a/unit_test/sparse/Test_Sparse_spiluk.hpp b/unit_test/sparse/Test_Sparse_spiluk.hpp
index 31bd4b47ec..863bdf0808 100644
--- a/unit_test/sparse/Test_Sparse_spiluk.hpp
+++ b/unit_test/sparse/Test_Sparse_spiluk.hpp
@@ -45,11 +45,10 @@
 #include <gtest/gtest.h>
 #include <Kokkos_Core.hpp>
 
-#include <Kokkos_Concepts.hpp>
 #include <string>
 #include <stdexcept>
 
-#include "KokkosKernels_SparseUtils.hpp"
+#include "KokkosSparse_Utils.hpp"
 #include "KokkosSparse_CrsMatrix.hpp"
 #include <KokkosKernels_IOUtils.hpp>
 #include "KokkosBlas1_nrm2.hpp"
@@ -299,142 +298,15 @@ void test_spiluk() {
   Test::run_test_spiluk<scalar_t, lno_t, size_type, device>();
 }
 
-#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE)                      \
+#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE)        \
   TEST_F(TestCategory,                                                     \
          sparse##_##spiluk##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \
     test_spiluk<SCALAR, ORDINAL, OFFSET, DEVICE>();                        \
   }
 
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&      \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&        \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||     \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&         \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&    \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&           \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||  \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int64_t, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&       \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&        \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(float, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&           \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||     \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(float, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&    \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&           \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(float, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&           \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||  \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(float, int64_t, size_t, TestExecSpace)
-#endif
-
-#if 0
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&            \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||            \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
- EXECUTE_TEST(kokkos_complex_double, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&        \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||            \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
- EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&            \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||         \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
- EXECUTE_TEST(kokkos_complex_double, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&        \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||         \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
- EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&           \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||           \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
- EXECUTE_TEST(kokkos_complex_float, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&       \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||           \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
- EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&           \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||        \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
- EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&       \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||        \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
- EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace)
-#endif
-
-#endif
-
-#undef EXECUTE_TEST
+#define NO_TEST_COMPLEX
+
+#include <Test_Common_Test_All_Type_Combos.hpp>
+
+#undef KOKKOSKERNELS_EXECUTE_TEST
+#undef NO_TEST_COMPLEX
diff --git a/unit_test/sparse/Test_Sparse_spmv.hpp b/unit_test/sparse/Test_Sparse_spmv.hpp
index 55c608a11e..8a15153dce 100644
--- a/unit_test/sparse/Test_Sparse_spmv.hpp
+++ b/unit_test/sparse/Test_Sparse_spmv.hpp
@@ -6,6 +6,7 @@
 #include <KokkosKernels_TestUtils.hpp>
 #include <KokkosKernels_Test_Structured_Matrix.hpp>
 #include <KokkosKernels_IOUtils.hpp>
+#include <KokkosSparse_IOUtils.hpp>
 #include <KokkosKernels_Utils.hpp>
 
 #include "KokkosKernels_Controls.hpp"
@@ -22,6 +23,32 @@ typedef Kokkos::Experimental::half_t kokkos_half;
 
 namespace Test {
 
+// Functor checking that the results of SPMV
+// are consistent with a reference sequential
+// implementation of the same operation.
+//
+// Inputs:
+// - _ex_y      the expected result calculated
+//              from the reference implementation
+// - _y         the result from optimized SPMV being
+//              tested for correctness
+// - _eps       the tolerance required to accept the
+//              results as correct
+// - _max_val   the largest possible value that can
+//              be stored as an intermediate result
+//              during the computation
+//
+//  The criteria to assess correctness is
+//     abs(_ex_y - _y) / _max_val < tol
+//
+//  Note: _max_val in the case of SPMV can be computed
+//  as follows. Find the max number of entries per
+//  row in the matrix (max_row_length), also find the
+//  largest value that can be stored in the matrix, x
+//  and y vectors (max_mat, max_x and max_y).
+//
+//     _max_val = beta*max_y
+//                + alpha*max_row_length*max_mat*max_x
 template <class VectorType0, class VectorType1>
 struct fSPMV {
   using value_type = int;
@@ -32,21 +59,24 @@ struct fSPMV {
   VectorType0 expected_y;
   VectorType1 y;
   mag_type eps;
+  mag_type max_val;
 
-  fSPMV(const VectorType0 &_ex_y, const VectorType1 &_y, const mag_type _eps)
-      : expected_y(_ex_y), y(_y), eps(_eps) {}
+  fSPMV(const VectorType0 &_ex_y, const VectorType1 &_y, const mag_type _eps,
+        const mag_type _max_val = ATM::one())
+      : expected_y(_ex_y),
+        y(_y),
+        eps(AT::abs(_eps)),
+        max_val(AT::abs(_max_val)) {}
 
   KOKKOS_INLINE_FUNCTION
   void operator()(const int i, value_type &err) const {
-    const mag_type error =
-        AT::abs(expected_y(i) - y(i)) / (AT::abs(expected_y(i)) > ATM::zero()
-                                             ? AT::abs(expected_y(i))
-                                             : ATM::one());
+    const mag_type error = AT::abs(expected_y(i) - y(i));
 
-    if (error > eps) {
+    if (error > eps * max_val) {
       err++;
-      // printf("expected_y(%d)=%f, y(%d)=%f err=%f, eps=%f\n", i,
-      //        AT::abs(expected_y(i)), i, AT::abs(y(i)), error, eps);
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+          "expected_y(%d)=%f, y(%d)=%f err=%f, max_error=%f\n", i,
+          AT::abs(expected_y(i)), i, AT::abs(y(i)), error, eps * max_val);
     }
   }
 };
@@ -113,9 +143,12 @@ void sequential_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y,
 }
 
 template <typename crsMat_t, typename x_vector_type, typename y_vector_type>
-void check_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y,
-                typename y_vector_type::non_const_value_type alpha,
-                typename y_vector_type::non_const_value_type beta, char mode) {
+void check_spmv(
+    crsMat_t input_mat, x_vector_type x, y_vector_type y,
+    typename y_vector_type::non_const_value_type alpha,
+    typename y_vector_type::non_const_value_type beta, char mode,
+    typename Kokkos::ArithTraits<typename crsMat_t::value_type>::mag_type
+        max_val) {
   // typedef typename crsMat_t::StaticCrsGraphType graph_t;
   using ExecSpace        = typename crsMat_t::execution_space;
   using my_exec_space    = Kokkos::RangePolicy<ExecSpace>;
@@ -123,11 +156,8 @@ void check_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y,
   using y_value_trait    = Kokkos::ArithTraits<y_value_type>;
   using y_value_mag_type = typename y_value_trait::mag_type;
 
-  // y is the quantity being tested here,
-  // so let us use y_value_type to determine
-  // the appropriate tolerance precision.
   const y_value_mag_type eps =
-      std::is_same<y_value_mag_type, float>::value ? 2 * 1e-3 : 1e-7;
+      10 * Kokkos::ArithTraits<y_value_mag_type>::eps();
   bool transposed = (mode == 'T') || (mode == 'H');
   y_vector_type expected_y(
       "expected", transposed ? input_mat.numCols() : input_mat.numRows());
@@ -150,7 +180,8 @@ void check_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y,
   int num_errors = 0;
   Kokkos::parallel_reduce(
       "KokkosSparse::Test::spmv", my_exec_space(0, y.extent(0)),
-      fSPMV<y_vector_type, y_vector_type>(expected_y, y, eps), num_errors);
+      fSPMV<y_vector_type, y_vector_type>(expected_y, y, eps, max_val),
+      num_errors);
   if (num_errors > 0)
     printf("KokkosSparse::Test::spmv: %i errors of %i with params: %lf %lf\n",
            num_errors, y.extent_int(0), y_value_trait::abs(alpha),
@@ -159,11 +190,13 @@ void check_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y,
 }
 
 template <typename crsMat_t, typename x_vector_type, typename y_vector_type>
-void check_spmv_mv(crsMat_t input_mat, x_vector_type x, y_vector_type y,
-                   y_vector_type expected_y,
-                   typename y_vector_type::non_const_value_type alpha,
-                   typename y_vector_type::non_const_value_type beta, int numMV,
-                   char mode) {
+void check_spmv_mv(
+    crsMat_t input_mat, x_vector_type x, y_vector_type y,
+    y_vector_type expected_y,
+    typename y_vector_type::non_const_value_type alpha,
+    typename y_vector_type::non_const_value_type beta, int numMV, char mode,
+    typename Kokkos::ArithTraits<typename crsMat_t::value_type>::mag_type
+        max_val) {
   using ExecSpace        = typename crsMat_t::execution_space;
   using my_exec_space    = Kokkos::RangePolicy<ExecSpace>;
   using y_value_type     = typename y_vector_type::non_const_value_type;
@@ -174,7 +207,7 @@ void check_spmv_mv(crsMat_t input_mat, x_vector_type x, y_vector_type y,
   // so let us use y_value_type to determine
   // the appropriate tolerance precision.
   const y_value_mag_type eps =
-      std::is_same<y_value_mag_type, float>::value ? 2 * 1e-3 : 1e-7;
+      10 * Kokkos::ArithTraits<y_value_mag_type>::eps();
 
   Kokkos::deep_copy(expected_y, y);
 
@@ -205,7 +238,8 @@ void check_spmv_mv(crsMat_t input_mat, x_vector_type x, y_vector_type y,
     int num_errors = 0;
     Kokkos::parallel_reduce(
         "KokkosSparse::Test::spmv_mv", my_exec_space(0, y_i.extent(0)),
-        fSPMV<decltype(y_i), decltype(y_spmv)>(y_i, y_spmv, eps), num_errors);
+        fSPMV<decltype(y_i), decltype(y_spmv)>(y_i, y_spmv, eps, max_val),
+        num_errors);
     if (num_errors > 0)
       std::cout << "KokkosSparse::Test::spmv_mv: " << num_errors
                 << " errors of " << y_i.extent_int(0) << " for mv " << i
@@ -223,7 +257,9 @@ void check_spmv_struct(
         structure,
     x_vector_type x, y_vector_type y,
     typename y_vector_type::non_const_value_type alpha,
-    typename y_vector_type::non_const_value_type beta) {
+    typename y_vector_type::non_const_value_type beta,
+    typename Kokkos::ArithTraits<typename crsMat_t::value_type>::mag_type
+        max_val) {
   using ExecSpace        = typename crsMat_t::execution_space;
   using my_exec_space    = Kokkos::RangePolicy<ExecSpace>;
   using y_value_type     = typename y_vector_type::non_const_value_type;
@@ -233,9 +269,8 @@ void check_spmv_struct(
   // y is the quantity being tested here,
   // so let us use y_value_type to determine
   // the appropriate tolerance precision.
-  const double eps =
-      std::is_same<y_value_mag_type, float>::value ? 2 * 1e-3 : 1e-7;
-  const size_t nr = input_mat.numRows();
+  const double eps = Kokkos::ArithTraits<y_value_mag_type>::eps();
+  const size_t nr  = input_mat.numRows();
   y_vector_type expected_y("expected", nr);
   Kokkos::deep_copy(expected_y, y);
   Kokkos::fence();
@@ -247,13 +282,15 @@ void check_spmv_struct(
   int num_errors = 0;
   Kokkos::parallel_reduce(
       "KokkosKernels::UnitTests::spmv_struct", my_exec_space(0, y.extent(0)),
-      fSPMV<y_vector_type, y_vector_type>(expected_y, y, eps), num_errors);
-  if (num_errors > 0)
+      fSPMV<y_vector_type, y_vector_type>(expected_y, y, eps, max_val),
+      num_errors);
+  if (num_errors > 0) {
     printf(
         "KokkosKernels::UnitTests::spmv_struct: %i errors of %i with params: "
         "%d %lf %lf\n",
         num_errors, y.extent_int(0), stencil_type, y_value_trait::abs(alpha),
         y_value_trait::abs(beta));
+  }
   EXPECT_TRUE(num_errors == 0);
 }  // check_spmv_struct
 
@@ -265,7 +302,9 @@ void check_spmv_mv_struct(
         structure,
     x_vector_type x, y_vector_type y, y_vector_type expected_y,
     typename y_vector_type::non_const_value_type alpha,
-    typename y_vector_type::non_const_value_type beta, int numMV) {
+    typename y_vector_type::non_const_value_type beta, int numMV,
+    typename Kokkos::ArithTraits<typename crsMat_t::value_type>::mag_type
+        max_val) {
   using ExecSpace        = typename crsMat_t::execution_space;
   using my_exec_space    = Kokkos::RangePolicy<ExecSpace>;
   using y_value_type     = typename y_vector_type::non_const_value_type;
@@ -275,8 +314,7 @@ void check_spmv_mv_struct(
   // y is the quantity being tested here,
   // so let us use y_value_type to determine
   // the appropriate tolerance precision.
-  const double eps =
-      std::is_same<y_value_mag_type, float>::value ? 2 * 1e-3 : 1e-7;
+  const double eps = Kokkos::ArithTraits<y_value_mag_type>::eps();
   Kokkos::deep_copy(expected_y, y);
   Kokkos::fence();
 
@@ -295,7 +333,8 @@ void check_spmv_mv_struct(
     Kokkos::parallel_reduce(
         "KokkosKernels::UnitTests::spmv_mv_struct",
         my_exec_space(0, y.extent(0)),
-        fSPMV<decltype(y_i), decltype(y_spmv)>(y_i, y_spmv, eps), num_errors);
+        fSPMV<decltype(y_i), decltype(y_spmv)>(y_i, y_spmv, eps, max_val),
+        num_errors);
     if (num_errors > 0)
       printf(
           "KokkosKernels::UnitTests::spmv_mv_struct: %i errors of %i with "
@@ -307,10 +346,13 @@ void check_spmv_mv_struct(
 }  // check_spmv_mv_struct
 
 template <typename crsMat_t, typename x_vector_type, typename y_vector_type>
-void check_spmv_controls(KokkosKernels::Experimental::Controls controls,
-                         crsMat_t input_mat, x_vector_type x, y_vector_type y,
-                         typename y_vector_type::non_const_value_type alpha,
-                         typename y_vector_type::non_const_value_type beta) {
+void check_spmv_controls(
+    KokkosKernels::Experimental::Controls controls, crsMat_t input_mat,
+    x_vector_type x, y_vector_type y,
+    typename y_vector_type::non_const_value_type alpha,
+    typename y_vector_type::non_const_value_type beta,
+    typename Kokkos::ArithTraits<typename crsMat_t::value_type>::mag_type
+        max_val) {
   // typedef typename crsMat_t::StaticCrsGraphType graph_t;
   using ExecSpace        = typename crsMat_t::execution_space;
   using my_exec_space    = Kokkos::RangePolicy<ExecSpace>;
@@ -339,7 +381,8 @@ void check_spmv_controls(KokkosKernels::Experimental::Controls controls,
   int num_errors = 0;
   Kokkos::parallel_reduce(
       "KokkosSparse::Test::spmv", my_exec_space(0, y.extent(0)),
-      fSPMV<y_vector_type, y_vector_type>(expected_y, y, eps), num_errors);
+      fSPMV<y_vector_type, y_vector_type>(expected_y, y, eps, max_val),
+      num_errors);
   if (num_errors > 0)
     printf("KokkosSparse::Test::spmv: %i errors of %i with params: %lf %lf\n",
            num_errors, y.extent_int(0), y_value_trait::abs(alpha),
@@ -367,20 +410,27 @@ Kokkos::complex<float> randomUpperBound<Kokkos::complex<float>>(int mag) {
 template <typename scalar_t, typename lno_t, typename size_type, class Device>
 void test_spmv(lno_t numRows, size_type nnz, lno_t bandwidth,
                lno_t row_size_variance, bool heavy) {
-  typedef
-      typename KokkosSparse::CrsMatrix<scalar_t, lno_t, Device, void, size_type>
-          crsMat_t;
-  typedef typename crsMat_t::values_type::non_const_type scalar_view_t;
-  typedef scalar_view_t x_vector_type;
-  typedef scalar_view_t y_vector_type;
+  using crsMat_t = typename KokkosSparse::CrsMatrix<scalar_t, lno_t, Device,
+                                                    void, size_type>;
+  using scalar_view_t = typename crsMat_t::values_type::non_const_type;
+  using x_vector_type = scalar_view_t;
+  using y_vector_type = scalar_view_t;
+  using mag_t         = typename Kokkos::ArithTraits<scalar_t>::mag_type;
+
+  constexpr mag_t max_x   = static_cast<mag_t>(1);
+  constexpr mag_t max_y   = static_cast<mag_t>(1);
+  constexpr mag_t max_val = static_cast<mag_t>(1);
 
   lno_t numCols = numRows;
 
-  crsMat_t input_mat = KokkosKernels::Impl::kk_generate_sparse_matrix<crsMat_t>(
+  crsMat_t input_mat = KokkosSparse::Impl::kk_generate_sparse_matrix<crsMat_t>(
       numRows, numCols, nnz, row_size_variance, bandwidth);
   lno_t nr = input_mat.numRows();
   lno_t nc = input_mat.numCols();
 
+  const lno_t max_nnz_per_row =
+      numRows ? (nnz / numRows + row_size_variance) : 0;
+
   x_vector_type input_x("x", nc);
   y_vector_type output_y("y", nr);
   x_vector_type input_xt("x", nr);
@@ -389,13 +439,16 @@ void test_spmv(lno_t numRows, size_type nnz, lno_t bandwidth,
   Kokkos::Random_XorShift64_Pool<typename Device::execution_space> rand_pool(
       13718);
 
-  typedef typename x_vector_type::value_type ScalarX;
-  typedef typename y_vector_type::value_type ScalarY;
+  Kokkos::fill_random(input_x, rand_pool, randomUpperBound<scalar_t>(max_x));
+  Kokkos::fill_random(output_y, rand_pool, randomUpperBound<scalar_t>(max_y));
+  Kokkos::fill_random(input_xt, rand_pool, randomUpperBound<scalar_t>(max_x));
+  Kokkos::fill_random(output_yt, rand_pool, randomUpperBound<scalar_t>(max_y));
 
-  Kokkos::fill_random(input_x, rand_pool, randomUpperBound<ScalarX>(1));
-  Kokkos::fill_random(output_y, rand_pool, randomUpperBound<ScalarY>(1));
-  Kokkos::fill_random(input_xt, rand_pool, randomUpperBound<ScalarX>(1));
-  Kokkos::fill_random(output_yt, rand_pool, randomUpperBound<ScalarY>(1));
+  // We also need to bound the values
+  // in the matrix to bound the cancellations
+  // coming from arithmetic operations.
+  Kokkos::fill_random(input_mat.values, rand_pool,
+                      randomUpperBound<scalar_t>(max_val));
 
   std::vector<char> nonTransModes   = {'N'};
   std::vector<char> transModes      = {'T'};
@@ -409,14 +462,21 @@ void test_spmv(lno_t numRows, size_type nnz, lno_t bandwidth,
   for (auto mode : nonTransModes) {
     for (double alpha : testAlphaBeta) {
       for (double beta : testAlphaBeta) {
-        Test::check_spmv(input_mat, input_x, output_y, alpha, beta, mode);
+        mag_t max_error =
+            beta * max_y + alpha * max_nnz_per_row * max_val * max_x;
+        Test::check_spmv(input_mat, input_x, output_y, alpha, beta, mode,
+                         max_error);
       }
     }
   }
   for (auto mode : transModes) {
     for (double alpha : testAlphaBeta) {
       for (double beta : testAlphaBeta) {
-        Test::check_spmv(input_mat, input_xt, output_yt, alpha, beta, mode);
+        // hoping the transpose won't have a long column...
+        mag_t max_error =
+            beta * max_y + alpha * max_nnz_per_row * max_val * max_x;
+        Test::check_spmv(input_mat, input_xt, output_yt, alpha, beta, mode,
+                         max_error);
       }
     }
   }
@@ -426,14 +486,18 @@ template <typename scalar_t, typename lno_t, typename size_type,
           typename layout, class Device>
 void test_spmv_mv(lno_t numRows, size_type nnz, lno_t bandwidth,
                   lno_t row_size_variance, bool heavy, int numMV) {
-  lno_t numCols = numRows;
+  using mag_t = typename Kokkos::ArithTraits<scalar_t>::mag_type;
+
+  constexpr mag_t max_x   = static_cast<mag_t>(1);
+  constexpr mag_t max_y   = static_cast<mag_t>(1);
+  constexpr mag_t max_val = static_cast<mag_t>(1);
 
-  typedef
-      typename KokkosSparse::CrsMatrix<scalar_t, lno_t, Device, void, size_type>
-          crsMat_t;
+  lno_t numCols = numRows;
 
-  typedef Kokkos::View<scalar_t **, layout, Device> ViewTypeX;
-  typedef Kokkos::View<scalar_t **, layout, Device> ViewTypeY;
+  using crsMat_t  = typename KokkosSparse::CrsMatrix<scalar_t, lno_t, Device,
+                                                    void, size_type>;
+  using ViewTypeX = Kokkos::View<scalar_t **, layout, Device>;
+  using ViewTypeY = Kokkos::View<scalar_t **, layout, Device>;
 
   ViewTypeX b_x("A", numRows, numMV);
   ViewTypeY b_y("B", numCols, numMV);
@@ -445,14 +509,23 @@ void test_spmv_mv(lno_t numRows, size_type nnz, lno_t bandwidth,
 
   Kokkos::Random_XorShift64_Pool<typename Device::execution_space> rand_pool(
       13718);
-  Kokkos::fill_random(b_x, rand_pool, randomUpperBound<scalar_t>(1));
-  Kokkos::fill_random(b_y, rand_pool, randomUpperBound<scalar_t>(1));
-  Kokkos::fill_random(b_xt, rand_pool, randomUpperBound<scalar_t>(1));
-  Kokkos::fill_random(b_yt, rand_pool, randomUpperBound<scalar_t>(1));
+  Kokkos::fill_random(b_x, rand_pool, randomUpperBound<scalar_t>(max_x));
+  Kokkos::fill_random(b_y, rand_pool, randomUpperBound<scalar_t>(max_y));
+  Kokkos::fill_random(b_xt, rand_pool, randomUpperBound<scalar_t>(max_x));
+  Kokkos::fill_random(b_yt, rand_pool, randomUpperBound<scalar_t>(max_y));
 
-  crsMat_t input_mat = KokkosKernels::Impl::kk_generate_sparse_matrix<crsMat_t>(
+  crsMat_t input_mat = KokkosSparse::Impl::kk_generate_sparse_matrix<crsMat_t>(
       numRows, numCols, nnz, row_size_variance, bandwidth);
 
+  const lno_t max_nnz_per_row =
+      numRows ? (nnz / numRows + row_size_variance) : 0;
+
+  // We also need to bound the values
+  // in the matrix to bound the cancellations
+  // coming from arithmetic operations.
+  Kokkos::fill_random(input_mat.values, rand_pool,
+                      randomUpperBound<scalar_t>(max_val));
+
   Kokkos::deep_copy(b_y_copy, b_y);
   Kokkos::deep_copy(b_yt_copy, b_yt);
 
@@ -468,16 +541,21 @@ void test_spmv_mv(lno_t numRows, size_type nnz, lno_t bandwidth,
   for (auto mode : nonTransModes) {
     for (double alpha : testAlphaBeta) {
       for (double beta : testAlphaBeta) {
+        mag_t max_error =
+            beta * max_y + alpha * max_nnz_per_row * max_val * max_x;
         Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, alpha, beta, numMV,
-                            mode);
+                            mode, max_error);
       }
     }
   }
   for (auto mode : transModes) {
     for (double alpha : testAlphaBeta) {
       for (double beta : testAlphaBeta) {
+        // hoping the transpose won't have a long column...
+        mag_t max_error =
+            beta * max_y + alpha * max_nnz_per_row * max_val * max_x;
         Test::check_spmv_mv(input_mat, b_xt, b_yt, b_yt_copy, alpha, beta,
-                            numMV, mode);
+                            numMV, mode, max_error);
       }
     }
   }
@@ -487,18 +565,24 @@ template <typename scalar_t, typename lno_t, typename size_type,
           typename layout, class Device>
 void test_spmv_mv_heavy(lno_t numRows, size_type nnz, lno_t bandwidth,
                         lno_t row_size_variance, int numMV) {
-  typedef
-      typename KokkosSparse::CrsMatrix<scalar_t, lno_t, Device, void, size_type>
-          crsMat_t;
+  using crsMat_t  = typename KokkosSparse::CrsMatrix<scalar_t, lno_t, Device,
+                                                    void, size_type>;
+  using ViewTypeX = Kokkos::View<scalar_t **, layout, Device>;
+  using ViewTypeY = Kokkos::View<scalar_t **, layout, Device>;
+  using mag_t     = typename Kokkos::ArithTraits<scalar_t>::mag_type;
 
-  typedef Kokkos::View<scalar_t **, layout, Device> ViewTypeX;
-  typedef Kokkos::View<scalar_t **, layout, Device> ViewTypeY;
+  constexpr mag_t max_x   = static_cast<mag_t>(10);
+  constexpr mag_t max_y   = static_cast<mag_t>(10);
+  constexpr mag_t max_val = static_cast<mag_t>(10);
 
-  crsMat_t input_mat = KokkosKernels::Impl::kk_generate_sparse_matrix<crsMat_t>(
+  crsMat_t input_mat = KokkosSparse::Impl::kk_generate_sparse_matrix<crsMat_t>(
       numRows, numRows, nnz, row_size_variance, bandwidth);
   Kokkos::Random_XorShift64_Pool<typename Device::execution_space> rand_pool(
       13718);
 
+  const lno_t max_nnz_per_row =
+      numRows ? (nnz / numRows + row_size_variance) : 0;
+
   for (int nv = 1; nv <= numMV; nv++) {
     ViewTypeX b_x("A", numRows, nv);
     ViewTypeY b_y("B", numRows, nv);
@@ -506,22 +590,30 @@ void test_spmv_mv_heavy(lno_t numRows, size_type nnz, lno_t bandwidth,
 
     Kokkos::fill_random(b_x, rand_pool, scalar_t(10));
     Kokkos::fill_random(b_y, rand_pool, scalar_t(10));
+    Kokkos::fill_random(input_mat.values, rand_pool, scalar_t(10));
 
     Kokkos::deep_copy(b_y_copy, b_y);
 
-    Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, nv, 'N');
-    Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, nv, 'N');
-    Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 1.0, nv, 'N');
-    Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, nv, 'T');
-    Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, nv, 'T');
+    Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, nv, 'N',
+                        max_nnz_per_row * max_val * max_x);
+    Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, nv, 'N',
+                        max_y);
+    Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 1.0, nv, 'N',
+                        max_y + max_nnz_per_row * max_val * max_x);
+    Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, nv, 'T',
+                        max_nnz_per_row * max_val * max_x);
+    Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, nv, 'T',
+                        max_y);
     // Testing all modes together, since matrix is square
     std::vector<char> modes           = {'N', 'C', 'T', 'H'};
     std::vector<double> testAlphaBeta = {0.0, 1.0, -1.0, 2.5};
     for (auto mode : modes) {
       for (double alpha : testAlphaBeta) {
         for (double beta : testAlphaBeta) {
+          mag_t max_error =
+              beta * max_y + alpha * max_nnz_per_row * max_val * max_x;
           Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, alpha, beta, nv,
-                              mode);
+                              mode, max_error);
         }
       }
     }
@@ -535,6 +627,11 @@ void test_spmv_struct_1D(lno_t nx, lno_t leftBC, lno_t rightBC) {
   using scalar_view_t = typename crsMat_t::values_type::non_const_type;
   using x_vector_type = scalar_view_t;
   using y_vector_type = scalar_view_t;
+  using mag_t         = typename Kokkos::ArithTraits<scalar_t>::mag_type;
+
+  constexpr mag_t max_x   = static_cast<mag_t>(1);
+  constexpr mag_t max_y   = static_cast<mag_t>(1);
+  constexpr mag_t max_val = static_cast<mag_t>(2);
 
   Kokkos::View<lno_t *, Kokkos::HostSpace> structure("Spmv Structure", 1);
   structure(0) = nx;
@@ -560,26 +657,31 @@ void test_spmv_struct_1D(lno_t nx, lno_t leftBC, lno_t rightBC) {
   Kokkos::Random_XorShift64_Pool<typename Device::execution_space> rand_pool(
       13718);
 
-  typedef typename x_vector_type::value_type ScalarX;
-  typedef typename y_vector_type::value_type ScalarY;
+  Kokkos::fill_random(input_x, rand_pool, max_x);
+  Kokkos::fill_random(output_y, rand_pool, max_y);
 
-  Kokkos::fill_random(input_x, rand_pool, ScalarX(1));
-  Kokkos::fill_random(output_y, rand_pool, ScalarY(1));
+  const mag_t max_error = max_y + 3 * max_val * max_x;
 
-  Test::check_spmv_struct(input_mat, 1, structure, input_x, output_y, 1.0, 0.0);
-  Test::check_spmv_struct(input_mat, 1, structure, input_x, output_y, 0.0, 1.0);
-  Test::check_spmv_struct(input_mat, 1, structure, input_x, output_y, 1.0, 1.0);
+  Test::check_spmv_struct(input_mat, 1, structure, input_x, output_y, 1.0, 0.0,
+                          max_error);
+  Test::check_spmv_struct(input_mat, 1, structure, input_x, output_y, 0.0, 1.0,
+                          max_error);
+  Test::check_spmv_struct(input_mat, 1, structure, input_x, output_y, 1.0, 1.0,
+                          max_error);
 }
 
 template <typename scalar_t, typename lno_t, typename size_type, class Device>
 void test_spmv_struct_2D(lno_t nx, lno_t ny, lno_t horizontalBC,
                          lno_t verticalBC) {
-  typedef
-      typename KokkosSparse::CrsMatrix<scalar_t, lno_t, Device, void, size_type>
-          crsMat_t;
-  typedef typename crsMat_t::values_type::non_const_type scalar_view_t;
-  typedef scalar_view_t x_vector_type;
-  typedef scalar_view_t y_vector_type;
+  using crsMat_t = typename KokkosSparse::CrsMatrix<scalar_t, lno_t, Device,
+                                                    void, size_type>;
+  using scalar_view_t = typename crsMat_t::values_type::non_const_type;
+  using x_vector_type = scalar_view_t;
+  using y_vector_type = scalar_view_t;
+  using mag_t         = typename Kokkos::ArithTraits<scalar_t>::mag_type;
+
+  constexpr mag_t max_x = static_cast<mag_t>(1);
+  constexpr mag_t max_y = static_cast<mag_t>(1);
 
   Kokkos::View<lno_t *, Kokkos::HostSpace> structure("Spmv Structure", 2);
   structure(0) = nx;
@@ -615,36 +717,44 @@ void test_spmv_struct_2D(lno_t nx, lno_t ny, lno_t horizontalBC,
   Kokkos::Random_XorShift64_Pool<typename Device::execution_space> rand_pool(
       13718);
 
-  typedef typename x_vector_type::value_type ScalarX;
-  typedef typename y_vector_type::value_type ScalarY;
-
-  Kokkos::fill_random(input_x, rand_pool, ScalarX(1));
-  Kokkos::fill_random(output_y, rand_pool, ScalarY(1));
-
-  Test::check_spmv_struct(input_mat_FD, 1, structure, input_x, output_y, 1.0,
-                          0.0);
-  Test::check_spmv_struct(input_mat_FD, 1, structure, input_x, output_y, 0.0,
-                          1.0);
-  Test::check_spmv_struct(input_mat_FD, 1, structure, input_x, output_y, 1.0,
-                          1.0);
-
-  Test::check_spmv_struct(input_mat_FE, 2, structure, input_x, output_y, 1.0,
-                          0.0);
-  Test::check_spmv_struct(input_mat_FE, 2, structure, input_x, output_y, 0.0,
-                          1.0);
-  Test::check_spmv_struct(input_mat_FE, 2, structure, input_x, output_y, 1.0,
-                          1.0);
+  Kokkos::fill_random(input_x, rand_pool, max_x);
+  Kokkos::fill_random(output_y, rand_pool, max_y);
+
+  {
+    constexpr mag_t max_val   = static_cast<mag_t>(4);
+    constexpr mag_t max_error = max_y + 5 * max_val * max_x;
+    Test::check_spmv_struct(input_mat_FD, 1, structure, input_x, output_y, 1.0,
+                            0.0, max_error);
+    Test::check_spmv_struct(input_mat_FD, 1, structure, input_x, output_y, 0.0,
+                            1.0, max_error);
+    Test::check_spmv_struct(input_mat_FD, 1, structure, input_x, output_y, 1.0,
+                            1.0, max_error);
+  }
+
+  {
+    constexpr mag_t max_val   = static_cast<mag_t>(8);
+    constexpr mag_t max_error = max_y + 9 * max_val * max_x;
+    Test::check_spmv_struct(input_mat_FE, 2, structure, input_x, output_y, 1.0,
+                            0.0, max_error);
+    Test::check_spmv_struct(input_mat_FE, 2, structure, input_x, output_y, 0.0,
+                            1.0, max_error);
+    Test::check_spmv_struct(input_mat_FE, 2, structure, input_x, output_y, 1.0,
+                            1.0, max_error);
+  }
 }
 
 template <typename scalar_t, typename lno_t, typename size_type, class Device>
 void test_spmv_struct_3D(lno_t nx, lno_t ny, lno_t nz, lno_t horizontal1BC,
                          lno_t horizontal2BC, lno_t verticalBC) {
-  typedef
-      typename KokkosSparse::CrsMatrix<scalar_t, lno_t, Device, void, size_type>
-          crsMat_t;
-  typedef typename crsMat_t::values_type::non_const_type scalar_view_t;
-  typedef scalar_view_t x_vector_type;
-  typedef scalar_view_t y_vector_type;
+  using crsMat_t = typename KokkosSparse::CrsMatrix<scalar_t, lno_t, Device,
+                                                    void, size_type>;
+  using scalar_view_t = typename crsMat_t::values_type::non_const_type;
+  using x_vector_type = scalar_view_t;
+  using y_vector_type = scalar_view_t;
+  using mag_t         = typename Kokkos::ArithTraits<scalar_t>::mag_type;
+
+  constexpr mag_t max_x = static_cast<mag_t>(1);
+  constexpr mag_t max_y = static_cast<mag_t>(1);
 
   Kokkos::View<lno_t *, Kokkos::HostSpace> structure("Spmv Structure", 3);
   structure(0) = nx;
@@ -688,35 +798,43 @@ void test_spmv_struct_3D(lno_t nx, lno_t ny, lno_t nz, lno_t horizontal1BC,
   Kokkos::Random_XorShift64_Pool<typename Device::execution_space> rand_pool(
       13718);
 
-  typedef typename x_vector_type::value_type ScalarX;
-  typedef typename y_vector_type::value_type ScalarY;
-
-  Kokkos::fill_random(input_x, rand_pool, ScalarX(1));
-  Kokkos::fill_random(output_y, rand_pool, ScalarY(1));
-
-  Test::check_spmv_struct(input_mat_FD, 1, structure, input_x, output_y, 1.0,
-                          0.0);
-  Test::check_spmv_struct(input_mat_FD, 1, structure, input_x, output_y, 0.0,
-                          1.0);
-  Test::check_spmv_struct(input_mat_FD, 1, structure, input_x, output_y, 1.0,
-                          1.0);
-
-  Test::check_spmv_struct(input_mat_FE, 2, structure, input_x, output_y, 1.0,
-                          0.0);
-  Test::check_spmv_struct(input_mat_FE, 2, structure, input_x, output_y, 0.0,
-                          1.0);
-  Test::check_spmv_struct(input_mat_FE, 2, structure, input_x, output_y, 1.0,
-                          1.0);
+  Kokkos::fill_random(input_x, rand_pool, max_x);
+  Kokkos::fill_random(output_y, rand_pool, max_y);
+
+  {
+    constexpr mag_t max_val   = static_cast<mag_t>(6);
+    constexpr mag_t max_error = max_y + 7 * max_val * max_x;
+    Test::check_spmv_struct(input_mat_FD, 1, structure, input_x, output_y, 1.0,
+                            0.0, max_error);
+    Test::check_spmv_struct(input_mat_FD, 1, structure, input_x, output_y, 0.0,
+                            1.0, max_error);
+    Test::check_spmv_struct(input_mat_FD, 1, structure, input_x, output_y, 1.0,
+                            1.0, max_error);
+  }
+
+  {
+    constexpr mag_t max_val   = static_cast<mag_t>(26);
+    constexpr mag_t max_error = max_y + 27 * max_val * max_x;
+    Test::check_spmv_struct(input_mat_FE, 2, structure, input_x, output_y, 1.0,
+                            0.0, max_error);
+    Test::check_spmv_struct(input_mat_FE, 2, structure, input_x, output_y, 0.0,
+                            1.0, max_error);
+    Test::check_spmv_struct(input_mat_FE, 2, structure, input_x, output_y, 1.0,
+                            1.0, max_error);
+  }
 }
 
 template <typename scalar_t, typename lno_t, typename size_type,
           typename layout, class Device>
 void test_spmv_mv_struct_1D(lno_t nx, int numMV) {
-  typedef
-      typename KokkosSparse::CrsMatrix<scalar_t, lno_t, Device, void, size_type>
-          crsMat_t;
-  typedef Kokkos::View<scalar_t **, layout, Device> x_multivector_type;
-  typedef Kokkos::View<scalar_t **, layout, Device> y_multivector_type;
+  using crsMat_t = typename KokkosSparse::CrsMatrix<scalar_t, lno_t, Device,
+                                                    void, size_type>;
+  using x_multivector_type = Kokkos::View<scalar_t **, layout, Device>;
+  using y_multivector_type = Kokkos::View<scalar_t **, layout, Device>;
+  using mag_t              = typename Kokkos::ArithTraits<scalar_t>::mag_type;
+
+  constexpr mag_t max_x = static_cast<mag_t>(1);
+  constexpr mag_t max_y = static_cast<mag_t>(1);
 
   Kokkos::View<lno_t *, Kokkos::HostSpace> structure("Spmv Structure", 1);
   structure(0) = nx;
@@ -739,20 +857,19 @@ void test_spmv_mv_struct_1D(lno_t nx, int numMV) {
   Kokkos::Random_XorShift64_Pool<typename Device::execution_space> rand_pool(
       13718);
 
-  typedef typename x_multivector_type::value_type ScalarX;
-  typedef typename y_multivector_type::value_type ScalarY;
+  Kokkos::fill_random(input_x, rand_pool, max_x);
+  Kokkos::fill_random(output_y, rand_pool, max_y);
 
-  Kokkos::fill_random(input_x, rand_pool, ScalarX(10));
-  Kokkos::fill_random(output_y, rand_pool, ScalarY(10));
+  constexpr mag_t max_error = 5;
 
   Kokkos::deep_copy(output_y_copy, output_y);
 
   Test::check_spmv_mv_struct(input_mat, 1, structure, input_x, output_y,
-                             output_y_copy, 1.0, 0.0, numMV);
+                             output_y_copy, 1.0, 0.0, numMV, max_error);
   Test::check_spmv_mv_struct(input_mat, 1, structure, input_x, output_y,
-                             output_y_copy, 0.0, 1.0, numMV);
+                             output_y_copy, 0.0, 1.0, numMV, max_error);
   Test::check_spmv_mv_struct(input_mat, 1, structure, input_x, output_y,
-                             output_y_copy, 1.0, 1.0, numMV);
+                             output_y_copy, 1.0, 1.0, numMV, max_error);
 }
 
 // check that the controls are flowing down correctly in the spmv kernel
@@ -765,10 +882,15 @@ void test_spmv_controls(lno_t numRows, size_type nnz, lno_t bandwidth,
   using x_vector_type = scalar_view_t;
   using y_vector_type = scalar_view_t;
   using Controls      = KokkosKernels::Experimental::Controls;
+  using mag_t         = typename Kokkos::ArithTraits<scalar_t>::mag_type;
+
+  constexpr mag_t max_x   = static_cast<mag_t>(10);
+  constexpr mag_t max_y   = static_cast<mag_t>(10);
+  constexpr mag_t max_val = static_cast<mag_t>(10);
 
   lno_t numCols = numRows;
 
-  crsMat_t input_mat = KokkosKernels::Impl::kk_generate_sparse_matrix<crsMat_t>(
+  crsMat_t input_mat = KokkosSparse::Impl::kk_generate_sparse_matrix<crsMat_t>(
       numRows, numCols, nnz, row_size_variance, bandwidth);
   lno_t nr = input_mat.numRows();
   lno_t nc = input_mat.numCols();
@@ -779,17 +901,20 @@ void test_spmv_controls(lno_t numRows, size_type nnz, lno_t bandwidth,
   Kokkos::Random_XorShift64_Pool<typename Device::execution_space> rand_pool(
       13718);
 
-  using ScalarX = typename x_vector_type::value_type;
-  using ScalarY = typename y_vector_type::value_type;
+  Kokkos::fill_random(input_x, rand_pool, max_x);
+  Kokkos::fill_random(output_y, rand_pool, max_y);
+  Kokkos::fill_random(input_mat.values, rand_pool, max_val);
 
-  Kokkos::fill_random(input_x, rand_pool, ScalarX(10));
-  Kokkos::fill_random(output_y, rand_pool, ScalarY(10));
+  const mag_t max_error = max_y + bandwidth * max_val * max_x;
 
   Controls controls;
 
-  Test::check_spmv_controls(controls, input_mat, input_x, output_y, 1.0, 0.0);
-  Test::check_spmv_controls(controls, input_mat, input_x, output_y, 0.0, 1.0);
-  Test::check_spmv_controls(controls, input_mat, input_x, output_y, 1.0, 1.0);
+  Test::check_spmv_controls(controls, input_mat, input_x, output_y, 1.0, 0.0,
+                            max_error);
+  Test::check_spmv_controls(controls, input_mat, input_x, output_y, 0.0, 1.0,
+                            max_error);
+  Test::check_spmv_controls(controls, input_mat, input_x, output_y, 1.0, 1.0,
+                            max_error);
 }  // test_spmv_controls
 
 // call it if ordinal int and, scalar float and double are instantiated.
@@ -937,23 +1062,12 @@ void test_github_issue_101() {
   }
 }
 
-#define EXECUTE_TEST_ISSUE_101(DEVICE)                                    \
-  TEST_F(TestCategory, sparse##_##spmv_issue_101##_##OFFSET##_##DEVICE) { \
-    test_github_issue_101<DEVICE>();                                      \
-  }
-
 template <typename CrsMat>
 CrsMat make_block_matrix(typename CrsMat::ordinal_type &numRows,
                          typename CrsMat::ordinal_type &numCols,
                          typename CrsMat::ordinal_type &blockSize) {
-#if 0
-    typedef typename CrsMat::StaticCrsGraphType::row_map_type::non_const_type ptr_type ;
-    typedef typename CrsMat::StaticCrsGraphType::entries_type::non_const_type ind_type ;
-    typedef typename CrsMat::values_type::non_const_type val_type ;
-    typedef typename CrsMat::size_type size_type;
-#endif
-  typedef typename CrsMat::ordinal_type lno_t;
-  typedef typename CrsMat::value_type scalar_t;
+  using lno_t    = typename CrsMat::ordinal_type;
+  using scalar_t = typename CrsMat::value_type;
 
   using Kokkos::HostSpace;
   using Kokkos::MemoryUnmanaged;
@@ -1212,22 +1326,21 @@ template <typename a_scalar_t, typename x_scalar_t, typename y_scalar_t,
 void test_spmv_bsrmatrix_controls_pattern(
     const KokkosKernels::Experimental::Controls &controls,
     const std::vector<Coordinate> &pattern, const int m, const int n,
-    lno_t blockSize, lno_t k, y_scalar_t alpha, y_scalar_t beta) {
+    lno_t blockSize, lno_t k, y_scalar_t alpha, y_scalar_t beta,
+    const int max_blocks_per_row) {
   // get the widest passed scalar type
   // typedef typename std::conditional<sizeof(a_scalar_t) >= sizeof(x_scalar_t),
   //                                   a_scalar_t, x_scalar_t>::type wider_t;
   // typedef typename std::conditional<sizeof(wider_t) >= sizeof(y_scalar_t),
   //                                   wider_t, y_scalar_t>::type widest_t;
 
-  typedef typename KokkosSparse::CrsMatrix<a_scalar_t, lno_t, Device, void,
-                                           size_type>
-      crs_mat_t;
-  typedef
+  using crs_mat_t = typename KokkosSparse::CrsMatrix<a_scalar_t, lno_t, Device,
+                                                     void, size_type>;
+  using bsr_mat_t =
       typename KokkosSparse::Experimental::BsrMatrix<a_scalar_t, lno_t, Device,
-                                                     void, size_type>
-          bsr_mat_t;
-  typedef Kokkos::View<x_scalar_t **, Layout, Device> x_view_t;
-  typedef Kokkos::View<y_scalar_t **, Layout, Device> y_view_t;
+                                                     void, size_type>;
+  using x_view_t = Kokkos::View<x_scalar_t **, Layout, Device>;
+  using y_view_t = Kokkos::View<y_scalar_t **, Layout, Device>;
 
   using DeviceRangePolicy = Kokkos::RangePolicy<Device>;
 
@@ -1248,23 +1361,19 @@ void test_spmv_bsrmatrix_controls_pattern(
   y_view_t test_y("test_y", m * blockSize, k);
   x_view_t test_x("test_x", n * blockSize, k);
 
+  constexpr x_scalar_t max_x = 10;
+  constexpr y_scalar_t max_y = 10;
+  constexpr a_scalar_t max_a = 10;
+  const double max_val =
+      beta * max_y + alpha * max_blocks_per_row * max_a * max_x;
+
   // fill expected with random values
   Kokkos::Random_XorShift64_Pool<typename Device::execution_space> rand_pool(
       13718);
   Kokkos::fill_random(exp_x, rand_pool,
-                      randomUpperBound<typename x_view_t::value_type>(10));
+                      randomUpperBound<typename x_view_t::value_type>(max_x));
   Kokkos::fill_random(exp_y, rand_pool,
-                      randomUpperBound<typename y_view_t::value_type>(10));
-
-#if 0
-  // fill inputs with 1, for help debugging
-  Kokkos::parallel_for("fill",
-    Kokkos::MDRangePolicy<Device, Kokkos::Rank<2>>({0,0}, {hi_x.extent(0), hi_x.extent(1)}),
-    KOKKOS_LAMBDA (unsigned i, unsigned j) { 
-        hi_x(i,j) = 1 + (i == 0 && j == 0); 
-    }
-  );
-#endif
+                      randomUpperBound<typename y_view_t::value_type>(max_y));
 
   // copy expected operands to test operands
   Kokkos::deep_copy(test_x, exp_x);
@@ -1292,11 +1401,11 @@ void test_spmv_bsrmatrix_controls_pattern(
     // uses CUDA's half type, not Kokkos, so we still need a reduced precision
     // test.
     double eps =
-        KOKKOSKERNELS_IMPL_FP16_EPSILON * KOKKOSKERNELS_IMPL_FP16_RADIX;
+        2 * KOKKOSKERNELS_IMPL_FP16_EPSILON * KOKKOSKERNELS_IMPL_FP16_RADIX;
     Kokkos::parallel_reduce("KokkosSparse::Test::spmv_tc",
                             DeviceRangePolicy(0, exp_y_i.extent(0)),
                             Test::fSPMV<decltype(exp_y_i), decltype(test_y_i)>(
-                                exp_y_i, test_y_i, eps),
+                                exp_y_i, test_y_i, eps, max_val),
                             num_errors);
     // explicit cast to double since no overload for half::operator<<
     if (num_errors > 0)
@@ -1318,13 +1427,14 @@ template <typename a_scalar_t, typename x_scalar_t, typename y_scalar_t,
           typename lno_t, typename size_type, typename Layout, typename Device>
 void test_spmv_bsrmatrix_pattern(const std::vector<Coordinate> &pattern,
                                  const int m, const int n, lno_t blockSize,
-                                 lno_t k, y_scalar_t alpha, y_scalar_t beta) {
+                                 lno_t k, y_scalar_t alpha, y_scalar_t beta,
+                                 const int max_blocks_per_row) {
   {
     KokkosKernels::Experimental::Controls controls;
     controls.setParameter("algorithm", "experimental_bsr_tc");
     test_spmv_bsrmatrix_controls_pattern<a_scalar_t, x_scalar_t, y_scalar_t,
                                          lno_t, size_type, Layout, Device>(
-        controls, pattern, m, n, blockSize, k, alpha, beta);
+        controls, pattern, m, n, blockSize, k, alpha, beta, max_blocks_per_row);
   }
 
 #if defined(KOKKOS_ARCH_AMPERE)
@@ -1334,7 +1444,7 @@ void test_spmv_bsrmatrix_pattern(const std::vector<Coordinate> &pattern,
     controls.setParameter("tc_precision", "double");
     test_spmv_bsrmatrix_controls_pattern<a_scalar_t, x_scalar_t, y_scalar_t,
                                          lno_t, size_type, Layout, Device>(
-        controls, pattern, m, n, blockSize, k, alpha, beta);
+        controls, pattern, m, n, blockSize, k, alpha, beta, max_blocks_per_row);
   }
 #endif
 }
@@ -1352,69 +1462,76 @@ void test_spmv_bsrmatrix(lno_t blockSize, lno_t k, y_scalar_t alpha,
   {
     int m                           = 1;
     int n                           = 1;
+    int max_blocks_per_row          = 1;
     std::vector<Coordinate> pattern = {Coordinate(0, 0)};
     test_spmv_bsrmatrix_pattern<a_scalar_t, x_scalar_t, y_scalar_t, lno_t,
                                 size_type, Layout, Device>(
-        pattern, m, n, blockSize, k, alpha, beta);
+        pattern, m, n, blockSize, k, alpha, beta, max_blocks_per_row);
   }
 
   // 1x1 empty
   {
     int m                           = 1;
     int n                           = 1;
+    int max_blocks_per_row          = 0;
     std::vector<Coordinate> pattern = {};
     test_spmv_bsrmatrix_pattern<a_scalar_t, x_scalar_t, y_scalar_t, lno_t,
                                 size_type, Layout, Device>(
-        pattern, m, n, blockSize, k, alpha, beta);
+        pattern, m, n, blockSize, k, alpha, beta, max_blocks_per_row);
   }
 
   // 2x2 top-left
   {
     int m                           = 2;
     int n                           = 2;
+    int max_blocks_per_row          = 1;
     std::vector<Coordinate> pattern = {Coordinate(0, 0)};
     test_spmv_bsrmatrix_pattern<a_scalar_t, x_scalar_t, y_scalar_t, lno_t,
                                 size_type, Layout, Device>(
-        pattern, m, n, blockSize, k, alpha, beta);
+        pattern, m, n, blockSize, k, alpha, beta, max_blocks_per_row);
   }
 
   // 2x2 bottom right
   {
     int m                           = 2;
     int n                           = 2;
+    int max_blocks_per_row          = 1;
     std::vector<Coordinate> pattern = {Coordinate(1, 1)};
     test_spmv_bsrmatrix_pattern<a_scalar_t, x_scalar_t, y_scalar_t, lno_t,
                                 size_type, Layout, Device>(
-        pattern, m, n, blockSize, k, alpha, beta);
+        pattern, m, n, blockSize, k, alpha, beta, max_blocks_per_row);
   }
 
   // 2x3 bottom right
   {
     int m                           = 2;
     int n                           = 3;
+    int max_blocks_per_row          = 1;
     std::vector<Coordinate> pattern = {Coordinate(1, 2)};
     test_spmv_bsrmatrix_pattern<a_scalar_t, x_scalar_t, y_scalar_t, lno_t,
                                 size_type, Layout, Device>(
-        pattern, m, n, blockSize, k, alpha, beta);
+        pattern, m, n, blockSize, k, alpha, beta, max_blocks_per_row);
   }
 
   // 2x10 long bottom row
   {
-    int m = 2;
-    int n = 10;
+    int m                  = 2;
+    int n                  = 10;
+    int max_blocks_per_row = 10;
     std::vector<Coordinate> pattern;
     for (int j = 0; j < n; ++j) {
       pattern.push_back(Coordinate(1, j));
     }
     test_spmv_bsrmatrix_pattern<a_scalar_t, x_scalar_t, y_scalar_t, lno_t,
                                 size_type, Layout, Device>(
-        pattern, m, n, blockSize, k, alpha, beta);
+        pattern, m, n, blockSize, k, alpha, beta, max_blocks_per_row);
   }
 
   // 10x10 column 1 + diagonal
   {
-    int m = 10;
-    int n = 10;
+    int m                  = 10;
+    int n                  = 10;
+    int max_blocks_per_row = 2;
     std::vector<Coordinate> pattern;
     for (int i = 0; i < n; ++i) {
       pattern.push_back(Coordinate(i, 1));
@@ -1424,11 +1541,16 @@ void test_spmv_bsrmatrix(lno_t blockSize, lno_t k, y_scalar_t alpha,
     }
     test_spmv_bsrmatrix_pattern<a_scalar_t, x_scalar_t, y_scalar_t, lno_t,
                                 size_type, Layout, Device>(
-        pattern, m, n, blockSize, k, alpha, beta);
+        pattern, m, n, blockSize, k, alpha, beta, max_blocks_per_row);
   }
 }
 
-#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE)                          \
+#define EXECUTE_TEST_ISSUE_101(DEVICE)                                    \
+  TEST_F(TestCategory, sparse##_##spmv_issue_101##_##OFFSET##_##DEVICE) { \
+    test_github_issue_101<DEVICE>();                                      \
+  }
+
+#define EXECUTE_TEST_FN(SCALAR, ORDINAL, OFFSET, DEVICE)                       \
   TEST_F(TestCategory,                                                         \
          sparse##_##spmv##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) {       \
     test_spmv<SCALAR, ORDINAL, OFFSET, DEVICE>(1000, 1000 * 3, 200, 10, true); \
@@ -1612,469 +1734,42 @@ EXECUTE_TEST_TC(double, double, double, int, size_t, LayoutRight, TestExecSpace)
 EXECUTE_TEST_ISSUE_101(TestExecSpace)
 #endif
 
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&      \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&        \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int, int, TestExecSpace)
-EXECUTE_TEST_STRUCT(double, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||     \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int64_t, int, TestExecSpace)
-EXECUTE_TEST_STRUCT(double, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&         \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&    \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&           \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int, size_t, TestExecSpace)
-EXECUTE_TEST_STRUCT(double, int, size_t, TestExecSpace)
-#endif
+#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \
+  EXECUTE_TEST_FN(SCALAR, ORDINAL, OFFSET, TestExecSpace)           \
+  EXECUTE_TEST_STRUCT(SCALAR, ORDINAL, OFFSET, TestExecSpace)
 
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||  \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int64_t, size_t, TestExecSpace)
-EXECUTE_TEST_STRUCT(double, int64_t, size_t, TestExecSpace)
-#endif
+#include <Test_Common_Test_All_Type_Combos.hpp>
 
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&       \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&        \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(float, int, int, TestExecSpace)
-EXECUTE_TEST_STRUCT(float, int, int, TestExecSpace)
-#endif
+#undef KOKKOSKERNELS_EXECUTE_TEST
 
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&           \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||     \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
+#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \
+    (!defined(KOKKOSKERNELS_ETI_ONLY) &&      \
      !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(float, int64_t, int, TestExecSpace)
-EXECUTE_TEST_STRUCT(float, int64_t, int, TestExecSpace)
-#endif
 
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&    \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&           \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(float, int, size_t, TestExecSpace)
-EXECUTE_TEST_STRUCT(float, int, size_t, TestExecSpace)
-#endif
+#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE)   \
+  EXECUTE_TEST_MV(SCALAR, ORDINAL, OFFSET, LayoutLeft, TestExecSpace) \
+  EXECUTE_TEST_MV_STRUCT(SCALAR, ORDINAL, OFFSET, LayoutLeft, TestExecSpace)
 
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&           \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||  \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(float, int64_t, size_t, TestExecSpace)
-EXECUTE_TEST_STRUCT(float, int64_t, size_t, TestExecSpace)
-#endif
+#include <Test_Common_Test_All_Type_Combos.hpp>
 
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&            \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||            \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_double, int, int, TestExecSpace)
-EXECUTE_TEST_STRUCT(kokkos_complex_double, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&        \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||            \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestExecSpace)
-EXECUTE_TEST_STRUCT(kokkos_complex_double, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&            \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||         \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_double, int, size_t, TestExecSpace)
-EXECUTE_TEST_STRUCT(kokkos_complex_double, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&        \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||         \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, TestExecSpace)
-EXECUTE_TEST_STRUCT(kokkos_complex_double, int64_t, size_t, TestExecSpace)
-#endif
+#undef KOKKOSKERNELS_EXECUTE_TEST
 
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&           \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||           \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_float, int, int, TestExecSpace)
-EXECUTE_TEST_STRUCT(kokkos_complex_float, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&       \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||           \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestExecSpace)
-EXECUTE_TEST_STRUCT(kokkos_complex_float, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&           \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||        \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace)
-EXECUTE_TEST_STRUCT(kokkos_complex_float, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&       \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||        \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace)
-EXECUTE_TEST_STRUCT(kokkos_complex_float, int64_t, size_t, TestExecSpace)
-#endif
-
-#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT)
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&      \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \
-     defined(KOKKOSKERNELS_INST_LAYOUTLEFT) &&  \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&        \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(double, int, int, LayoutLeft, TestExecSpace)
-EXECUTE_TEST_MV_STRUCT(double, int, int, LayoutLeft, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_LAYOUTLEFT) &&      \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||     \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(double, int64_t, int, LayoutLeft, TestExecSpace)
-EXECUTE_TEST_MV_STRUCT(double, int64_t, int, LayoutLeft, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&         \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&    \
-     defined(KOKKOSKERNELS_INST_LAYOUTLEFT) &&     \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&           \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(double, int, size_t, LayoutLeft, TestExecSpace)
-EXECUTE_TEST_MV_STRUCT(double, int, size_t, LayoutLeft, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_LAYOUTLEFT) &&      \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||  \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(double, int64_t, size_t, LayoutLeft, TestExecSpace)
-EXECUTE_TEST_MV_STRUCT(double, int64_t, size_t, LayoutLeft, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&       \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \
-     defined(KOKKOSKERNELS_INST_LAYOUTLEFT) &&  \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&        \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(float, int, int, LayoutLeft, TestExecSpace)
-EXECUTE_TEST_MV_STRUCT(float, int, int, LayoutLeft, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&           \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_LAYOUTLEFT) &&      \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||     \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(float, int64_t, int, LayoutLeft, TestExecSpace)
-EXECUTE_TEST_MV_STRUCT(float, int64_t, int, LayoutLeft, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&    \
-     defined(KOKKOSKERNELS_INST_LAYOUTLEFT) &&     \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&           \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(float, int, size_t, LayoutLeft, TestExecSpace)
-EXECUTE_TEST_MV_STRUCT(float, int, size_t, LayoutLeft, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&           \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_LAYOUTLEFT) &&      \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||  \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(float, int64_t, size_t, LayoutLeft, TestExecSpace)
-EXECUTE_TEST_MV_STRUCT(float, int64_t, size_t, LayoutLeft, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&            \
-     defined(KOKKOSKERNELS_INST_LAYOUTLEFT) &&             \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||            \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(kokkos_complex_double, int, int, LayoutLeft, TestExecSpace)
-EXECUTE_TEST_MV_STRUCT(kokkos_complex_double, int, int, LayoutLeft,
-                       TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&        \
-     defined(KOKKOSKERNELS_INST_LAYOUTLEFT) &&             \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||            \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(kokkos_complex_double, int64_t, int, LayoutLeft, TestExecSpace)
-EXECUTE_TEST_MV_STRUCT(kokkos_complex_double, int64_t, int, LayoutLeft,
-                       TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&            \
-     defined(KOKKOSKERNELS_INST_LAYOUTLEFT) &&             \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||         \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(kokkos_complex_double, int, size_t, LayoutLeft, TestExecSpace)
-EXECUTE_TEST_MV_STRUCT(kokkos_complex_double, int, size_t, LayoutLeft,
-                       TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&        \
-     defined(KOKKOSKERNELS_INST_LAYOUTLEFT) &&             \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||         \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(kokkos_complex_double, int64_t, size_t, LayoutLeft,
-                TestExecSpace)
-EXECUTE_TEST_MV_STRUCT(kokkos_complex_double, int64_t, size_t, LayoutLeft,
-                       TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&           \
-     defined(KOKKOSKERNELS_INST_LAYOUTLEFT) &&            \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||           \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(kokkos_complex_float, int, int, LayoutLeft, TestExecSpace)
-EXECUTE_TEST_MV_STRUCT(kokkos_complex_float, int, int, LayoutLeft,
-                       TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&       \
-     defined(KOKKOSKERNELS_INST_LAYOUTLEFT) &&            \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||           \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(kokkos_complex_float, int64_t, int, LayoutLeft, TestExecSpace)
-EXECUTE_TEST_MV_STRUCT(kokkos_complex_float, int64_t, int, LayoutLeft,
-                       TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&           \
-     defined(KOKKOSKERNELS_INST_LAYOUTLEFT) &&            \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||        \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(kokkos_complex_float, int, size_t, LayoutLeft, TestExecSpace)
-EXECUTE_TEST_MV_STRUCT(kokkos_complex_float, int, size_t, LayoutLeft,
-                       TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&       \
-     defined(KOKKOSKERNELS_INST_LAYOUTLEFT) &&            \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||        \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(kokkos_complex_float, int64_t, size_t, LayoutLeft,
-                TestExecSpace)
-EXECUTE_TEST_MV_STRUCT(kokkos_complex_float, int64_t, size_t, LayoutLeft,
-                       TestExecSpace)
-#endif
 #endif  // defined(KOKKOSKERNELS_INST_LAYOUTLEFT)
 
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&      \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \
-     defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&        \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(double, int, int, LayoutRight, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) &&     \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||     \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(double, int64_t, int, LayoutRight, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&         \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&    \
-     defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) &&    \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&           \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(double, int, size_t, LayoutRight, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) &&     \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||  \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(double, int64_t, size_t, LayoutRight, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&       \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \
-     defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&        \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(float, int, int, LayoutRight, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&           \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) &&     \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||     \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(float, int64_t, int, LayoutRight, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&    \
-     defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) &&    \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&           \
+#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \
+    (!defined(KOKKOSKERNELS_ETI_ONLY) &&       \
      !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(float, int, size_t, LayoutRight, TestExecSpace)
-#endif
 
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&           \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) &&     \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||  \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(float, int64_t, size_t, LayoutRight, TestExecSpace)
-#endif
+#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \
+  EXECUTE_TEST_MV(SCALAR, ORDINAL, OFFSET, LayoutRight, TestExecSpace)
 
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&            \
-     defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) &&            \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||            \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(kokkos_complex_double, int, int, LayoutRight, TestExecSpace)
-#endif
+#include <Test_Common_Test_All_Type_Combos.hpp>
 
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&        \
-     defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) &&            \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||            \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(kokkos_complex_double, int64_t, int, LayoutRight, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&            \
-     defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) &&            \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||         \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(kokkos_complex_double, int, size_t, LayoutRight, TestExecSpace)
-#endif
+#undef KOKKOSKERNELS_EXECUTE_TEST
 
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&        \
-     defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) &&            \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||         \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(kokkos_complex_double, int64_t, size_t, LayoutRight,
-                TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&           \
-     defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) &&           \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||           \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(kokkos_complex_float, int, int, LayoutRight, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&       \
-     defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) &&           \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||           \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(kokkos_complex_float, int64_t, int, LayoutRight, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&           \
-     defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) &&           \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||        \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(kokkos_complex_float, int, size_t, LayoutRight, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&       \
-     defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) &&           \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||        \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(kokkos_complex_float, int64_t, size_t, LayoutRight,
-                TestExecSpace)
-#endif
+#endif  // defined(KOKKOSKERNELS_INST_LAYOUTRIGHT)
 
-#undef EXECUTE_TEST
+#undef EXECUTE_TEST_FN
 #undef EXECUTE_TEST_STRUCT
 #undef EXECUTE_TEST_MV
 #undef EXECUTE_TEST_MV_STRUCT
diff --git a/unit_test/sparse/Test_Sparse_spmv_blockcrs.hpp b/unit_test/sparse/Test_Sparse_spmv_blockcrs.hpp
index f570a2d5df..a96af6973e 100644
--- a/unit_test/sparse/Test_Sparse_spmv_blockcrs.hpp
+++ b/unit_test/sparse/Test_Sparse_spmv_blockcrs.hpp
@@ -42,6 +42,7 @@
 //@HEADER
 */
 
+#include <algorithm>
 #include <gtest/gtest.h>
 #include <Kokkos_Core.hpp>
 #include <stdexcept>
@@ -128,36 +129,44 @@ void check_blockcrs_times_v(const char fOp[], scalar_t alpha, scalar_t beta,
     size_type nnz = static_cast<size_type>(blockSize) *
                     static_cast<size_type>(blockSize) * mat_b1.nnz();
 
-    // Fill block with random values
-    std::vector<scalar_t> mat_val(nnz);
-    for (size_type ii = 0; ii < nnz; ++ii) set_random_value(mat_val[ii]);
-
     //
     // Create graph for CrsMatrix
     //
 
-    std::vector<lno_t> mat_rowmap(nRow + 1, 0);
-    std::vector<lno_t> mat_colidx(nnz, 0);
+    Kokkos::View<size_type *, device> d_rowmap("crsmatrix", nRow + 1);
+    auto h_rowmap = Kokkos::create_mirror_view(d_rowmap);
+
+    Kokkos::View<lno_t *, device> d_colidx("crsmatrix", nnz);
+    auto h_colidx = Kokkos::create_mirror_view(d_colidx);
+
+    Kokkos::View<scalar_t *, device> d_matval("crsmatrix", nnz);
+    auto h_matval = Kokkos::create_mirror_view(d_matval);
+
+    for (size_type ii = 0; ii < nnz; ++ii) set_random_value(h_matval[ii]);
 
     for (lno_t ir = 0; ir < mat_b1.numRows(); ++ir) {
-      const auto jbeg = mat_b1.graph.row_map(ir);
-      const auto jend = mat_b1.graph.row_map(ir + 1);
+      const size_type jbeg = mat_b1.graph.row_map(ir);
+      const size_type jend = mat_b1.graph.row_map(ir + 1);
       for (lno_t ib = 0; ib < blockSize; ++ib) {
-        const lno_t my_row     = ir * blockSize + ib;
-        mat_rowmap[my_row + 1] = mat_rowmap[my_row] + (jend - jbeg) * blockSize;
-        for (lno_t ijk = jbeg; ijk < jend; ++ijk) {
+        const lno_t my_row   = ir * blockSize + ib;
+        h_rowmap[my_row + 1] = h_rowmap[my_row] + (jend - jbeg) * blockSize;
+        for (size_type ijk = jbeg; ijk < jend; ++ijk) {
           const auto col0 = mat_b1.graph.entries(ijk);
           for (lno_t jb = 0; jb < blockSize; ++jb) {
-            mat_colidx[mat_rowmap[my_row] + (ijk - jbeg) * blockSize + jb] =
+            h_colidx[h_rowmap[my_row] + (ijk - jbeg) * blockSize + jb] =
                 col0 * blockSize + jb;
           }
         }
       }
     }  // for (lno_t ir = 0; ir < mat_b1.numRows(); ++ir)
 
+    Kokkos::deep_copy(d_matval, h_matval);
+    Kokkos::deep_copy(d_colidx, h_colidx);
+    Kokkos::deep_copy(d_rowmap, h_rowmap);
+
     // Create the CrsMatrix for the reference computation
-    crsMat_t Acrs("new_crs_matr", nRow, nCol, nnz, &mat_val[0], &mat_rowmap[0],
-                  &mat_colidx[0]);
+    crsMat_t Acrs("new_crs_matr", nRow, nCol, nnz, d_matval, d_rowmap,
+                  d_colidx);
 
     x_vector_type xref("new_right_hand_side", nRow);
     auto h_xref = Kokkos::create_mirror_view(xref);
@@ -179,7 +188,7 @@ void check_blockcrs_times_v(const char fOp[], scalar_t alpha, scalar_t beta,
     // Compute the reference product
     KokkosSparse::spmv(fOp, alpha, Acrs, xref, beta, ycrs);
 
-    y_vector_type ybcrs("bsr_product_result", nRow);
+    y_vector_type ybcrs("bcrs_product_result", nRow);
     auto h_ybcrs = Kokkos::create_mirror_view(ybcrs);
     for (lno_t ir = 0; ir < nRow; ++ir) h_ybcrs(ir) = h_y0(ir);
     Kokkos::deep_copy(ybcrs, h_ybcrs);
@@ -187,26 +196,27 @@ void check_blockcrs_times_v(const char fOp[], scalar_t alpha, scalar_t beta,
     // Create the BlockCrsMatrix
     KokkosSparse::Experimental::BlockCrsMatrix<scalar_t, lno_t, device, void,
                                                size_type>
-        Absr(Acrs, blockSize);
+        Abcrs(Acrs, blockSize);
 
     // Compute the product with the BlockCrsMatrix format
-    KokkosSparse::spmv(fOp, alpha, Absr, xref, beta, ybcrs);
+    KokkosSparse::spmv(fOp, alpha, Abcrs, xref, beta, ybcrs);
 
     // Compare the two products
-    double error = 0.0, maxNorm = 0.0;
+    using KATS     = Kokkos::ArithTraits<scalar_t>;
+    using mag_type = typename KATS::mag_type;
+
+    const mag_type zero_mag = Kokkos::ArithTraits<mag_type>::zero();
+    mag_type error = zero_mag, maxNorm = zero_mag;
+
     Kokkos::deep_copy(h_ycrs, ycrs);
     Kokkos::deep_copy(h_ybcrs, ybcrs);
     for (lno_t ir = 0; ir < nRow; ++ir) {
-      error = std::max(
-          error, Kokkos::ArithTraits<scalar_t>::abs(h_ycrs(ir) - h_ybcrs(ir)));
-      maxNorm =
-          std::max(maxNorm, Kokkos::ArithTraits<scalar_t>::abs(h_ycrs(ir)));
+      error   = std::max<mag_type>(error, KATS::abs(h_ycrs(ir) - h_ybcrs(ir)));
+      maxNorm = std::max<mag_type>(maxNorm, KATS::abs(h_ycrs(ir)));
     }
 
-    double tmps =
-        static_cast<double>(Kokkos::ArithTraits<scalar_t>::abs(alpha)) +
-        static_cast<double>(Kokkos::ArithTraits<scalar_t>::abs(beta));
-    if ((tmps > 0.0) && (maxNorm == 0)) {
+    mag_type tmps = KATS::abs(alpha) + KATS::abs(beta);
+    if ((tmps > zero_mag) && (maxNorm == zero_mag)) {
       std::cout << " BlockCRSMatrix - SpMV times V >> blockSize " << blockSize
                 << " maxNorm " << maxNorm << " error " << error << " alpha "
                 << alpha << " beta " << beta << "\n";
@@ -216,9 +226,8 @@ void check_blockcrs_times_v(const char fOp[], scalar_t alpha, scalar_t beta,
     //
     // --- Factor ((nnz / nRow) + 1) = Average number of non-zeros per row
     //
-    const auto tol = ((nnz / nRow) + 1) *
-                     static_cast<double>(Kokkos::ArithTraits<scalar_t>::abs(
-                         Kokkos::ArithTraits<scalar_t>::epsilon()));
+    const mag_type tol = ((static_cast<mag_type>(nnz) / nRow) + 1) *
+                         Kokkos::ArithTraits<mag_type>::epsilon();
     if (error > tol * maxNorm) {
       std::cout << " BlockCRSMatrix - SpMV times V >> blockSize " << blockSize
                 << " ratio " << error / maxNorm << " tol " << tol << " maxNorm "
@@ -231,7 +240,7 @@ void check_blockcrs_times_v(const char fOp[], scalar_t alpha, scalar_t beta,
 
 /// \brief Driver routine for checking BlockCrsMatrix times multiple vector
 template <typename scalar_t, typename lno_t, typename size_type,
-          typename device>
+          typename layout, typename device>
 void check_blockcrs_times_mv(const char fOp[], scalar_t alpha, scalar_t beta,
                              const lno_t bMax, int &num_errors) {
   // The mat_structure view is used to generate a matrix using
@@ -255,7 +264,7 @@ void check_blockcrs_times_mv(const char fOp[], scalar_t alpha, scalar_t beta,
   typedef
       typename KokkosSparse::CrsMatrix<scalar_t, lno_t, device, void, size_type>
           crsMat_t;
-  typedef Kokkos::View<scalar_t **, Kokkos::LayoutLeft, device> block_vector_t;
+  typedef Kokkos::View<scalar_t **, layout, device> block_vector_t;
 
   h_crsMat_t mat_b1 =
       Test::generate_structured_matrix3D<h_crsMat_t>("FD", mat_structure);
@@ -273,41 +282,40 @@ void check_blockcrs_times_mv(const char fOp[], scalar_t alpha, scalar_t beta,
     size_type nnz = static_cast<size_type>(blockSize) *
                     static_cast<size_type>(blockSize) * mat_b1.nnz();
 
-    std::vector<scalar_t> mat_val(nnz);
-    for (size_type ii = 0; ii < nnz; ++ii) set_random_value(mat_val[ii]);
+    Kokkos::View<size_type *, device> d_rowmap("crsmatrix", nRow + 1);
+    auto h_rowmap = Kokkos::create_mirror_view(d_rowmap);
 
-    //
-    // Create graph for CrsMatrix
-    //
+    Kokkos::View<lno_t *, device> d_colidx("crsmatrix", nnz);
+    auto h_colidx = Kokkos::create_mirror_view(d_colidx);
 
-    std::vector<lno_t> mat_rowmap(nRow + 1);
-    std::vector<lno_t> mat_colidx(nnz);
+    Kokkos::View<scalar_t *, device> d_matval("crsmatrix", nnz);
+    auto h_matval = Kokkos::create_mirror_view(d_matval);
 
-    mat_rowmap.resize(nRow + 1);
-    auto *rowmap = &mat_rowmap[0];
-    rowmap[0]    = 0;
-
-    mat_colidx.resize(nnz);
-    auto *cols = &mat_colidx[0];
+    for (size_type ii = 0; ii < nnz; ++ii) set_random_value(h_matval[ii]);
 
     for (lno_t ir = 0; ir < mat_b1.numRows(); ++ir) {
-      const auto jbeg = mat_b1.graph.row_map(ir);
-      const auto jend = mat_b1.graph.row_map(ir + 1);
+      const size_type jbeg = mat_b1.graph.row_map(ir);
+      const size_type jend = mat_b1.graph.row_map(ir + 1);
       for (lno_t ib = 0; ib < blockSize; ++ib) {
-        const lno_t my_row = ir * blockSize + ib;
-        rowmap[my_row + 1] = rowmap[my_row] + (jend - jbeg) * blockSize;
-        for (lno_t ijk = jbeg; ijk < jend; ++ijk) {
+        const lno_t my_row   = ir * blockSize + ib;
+        h_rowmap[my_row + 1] = h_rowmap[my_row] + (jend - jbeg) * blockSize;
+        for (size_type ijk = jbeg; ijk < jend; ++ijk) {
           const auto col0 = mat_b1.graph.entries(ijk);
           for (lno_t jb = 0; jb < blockSize; ++jb) {
-            cols[rowmap[my_row] + (ijk - jbeg) * blockSize + jb] =
+            h_colidx[h_rowmap[my_row] + (ijk - jbeg) * blockSize + jb] =
                 col0 * blockSize + jb;
           }
         }
       }
     }  // for (lno_t ir = 0; ir < mat_b1.numRows(); ++ir)
 
+    Kokkos::deep_copy(d_matval, h_matval);
+    Kokkos::deep_copy(d_colidx, h_colidx);
+    Kokkos::deep_copy(d_rowmap, h_rowmap);
+
     // Create the CrsMatrix for the reference computation
-    crsMat_t Acrs("new_crs_matr", nRow, nCol, nnz, &mat_val[0], rowmap, cols);
+    crsMat_t Acrs("new_crs_matr", nRow, nCol, nnz, d_matval, d_rowmap,
+                  d_colidx);
 
     block_vector_t xref("new_right_hand_side", nRow, nrhs);
     auto h_xref = Kokkos::create_mirror_view(xref);
@@ -329,7 +337,7 @@ void check_blockcrs_times_mv(const char fOp[], scalar_t alpha, scalar_t beta,
 
     KokkosSparse::spmv(fOp, alpha, Acrs, xref, beta, ycrs);
 
-    block_vector_t ybcrs("bsr_product_result", nRow, nrhs);
+    block_vector_t ybcrs("bcrs_product_result", nRow, nrhs);
     auto h_ybcrs = Kokkos::create_mirror_view(ybcrs);
     for (int jc = 0; jc < nrhs; ++jc)
       for (lno_t ir = 0; ir < nRow; ++ir) h_ybcrs(ir, jc) = h_y0(ir, jc);
@@ -338,38 +346,40 @@ void check_blockcrs_times_mv(const char fOp[], scalar_t alpha, scalar_t beta,
     // Create the BlockCrsMatrix
     KokkosSparse::Experimental::BlockCrsMatrix<scalar_t, lno_t, device, void,
                                                size_type>
-        Absr(Acrs, blockSize);
+        Abcrs(Acrs, blockSize);
 
     // Compute the product for the BlockCrsMatrix format
-    KokkosSparse::spmv(fOp, alpha, Absr, xref, beta, ybcrs);
+    KokkosSparse::spmv(fOp, alpha, Abcrs, xref, beta, ybcrs);
 
     Kokkos::deep_copy(h_ycrs, ycrs);
     Kokkos::deep_copy(h_ybcrs, ybcrs);
 
     // Compare the two products
-    double error = 0.0, maxNorm = 0.0;
+    using KATS     = Kokkos::ArithTraits<scalar_t>;
+    using mag_type = typename KATS::mag_type;
+
+    const mag_type zero_mag = Kokkos::ArithTraits<mag_type>::zero();
+    mag_type error = zero_mag, maxNorm = zero_mag;
+
     for (int jc = 0; jc < nrhs; ++jc) {
       for (int ir = 0; ir < nRow; ++ir) {
-        error   = std::max(error, Kokkos::ArithTraits<scalar_t>::abs(
-                                    h_ycrs(ir, jc) - h_ybcrs(ir, jc)));
-        maxNorm = std::max(maxNorm,
-                           Kokkos::ArithTraits<scalar_t>::abs(h_ycrs(ir, jc)));
+        error   = std::max<mag_type>(error,
+                                   KATS::abs(h_ycrs(ir, jc) - h_ybcrs(ir, jc)));
+        maxNorm = std::max<mag_type>(maxNorm, KATS::abs(h_ycrs(ir, jc)));
       }
     }
-    auto tol = ((nnz / nRow) + 1) *
-               static_cast<double>(Kokkos::ArithTraits<scalar_t>::abs(
-                   Kokkos::ArithTraits<scalar_t>::epsilon()));
-
-    double tmps =
-        static_cast<double>(Kokkos::ArithTraits<scalar_t>::abs(alpha)) +
-        static_cast<double>(Kokkos::ArithTraits<scalar_t>::abs(beta));
-    if ((tmps > 0.0) && (maxNorm == 0)) {
+
+    const mag_type tmps = KATS::abs(alpha) + KATS::abs(beta);
+    if ((tmps > zero_mag) && (maxNorm == zero_mag)) {
       std::cout << " BlockCRSMatrix - SpMV times MV >> blockSize " << blockSize
                 << " maxNorm " << maxNorm << " error " << error << " alpha "
                 << alpha << " beta " << beta << "\n";
       num_errors += 1;
     }
 
+    const mag_type tol = ((static_cast<mag_type>(nnz) / nRow) + 1) *
+                         Kokkos::ArithTraits<mag_type>::epsilon();
+
     if (error > tol * maxNorm) {
       std::cout << " BlockCRSMatrix - SpMV times MV >> blockSize " << blockSize
                 << " ratio " << error / maxNorm << " tol " << tol << " maxNorm "
@@ -425,7 +435,7 @@ void testSpMVBlockCrsMatrix() {
 }
 
 template <typename scalar_t, typename lno_t, typename size_type,
-          typename device>
+          typename layout, typename device>
 void testBlockCrsMatrix_SpM_MV() {
   //
   // Test for the operation Y <- alpha * Op(A) * X + beta * Y
@@ -452,7 +462,7 @@ void testBlockCrsMatrix_SpM_MV() {
       auto alpha_s = static_cast<scalar_t>(testAlphaBeta[ii]);
       auto beta_s  = static_cast<scalar_t>(testAlphaBeta[ii + 1]);
       num_errors   = 0;
-      Test_BlockCrs::check_blockcrs_times_mv<scalar_t, lno_t, size_type,
+      Test_BlockCrs::check_blockcrs_times_mv<scalar_t, lno_t, size_type, layout,
                                              device>(&mode, alpha_s, beta_s,
                                                      bMax, num_errors);
       if (num_errors > 0) {
@@ -469,282 +479,49 @@ void testBlockCrsMatrix_SpM_MV() {
 
 //////////////////////////
 
-#define EXECUTE_BCRS_TIMES_VEC_TEST(SCALAR, ORDINAL, OFFSET, DEVICE)            \
+#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE)             \
   TEST_F(                                                                       \
       TestCategory,                                                             \
       sparse##_##bcrs_times_vec##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \
     testSpMVBlockCrsMatrix<SCALAR, ORDINAL, OFFSET, DEVICE>();                  \
   }
 
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&      \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&        \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BCRS_TIMES_VEC_TEST(double, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||     \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BCRS_TIMES_VEC_TEST(double, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&         \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&    \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&           \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BCRS_TIMES_VEC_TEST(double, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||  \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BCRS_TIMES_VEC_TEST(double, int64_t, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&       \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&        \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BCRS_TIMES_VEC_TEST(float, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&           \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||     \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BCRS_TIMES_VEC_TEST(float, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&    \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&           \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BCRS_TIMES_VEC_TEST(float, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&           \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||  \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BCRS_TIMES_VEC_TEST(float, int64_t, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&            \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||            \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BCRS_TIMES_VEC_TEST(kokkos_complex_double, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&        \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||            \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BCRS_TIMES_VEC_TEST(kokkos_complex_double, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&            \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||         \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BCRS_TIMES_VEC_TEST(kokkos_complex_double, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&        \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||         \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BCRS_TIMES_VEC_TEST(kokkos_complex_double, int64_t, size_t,
-                            TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&           \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||           \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BCRS_TIMES_VEC_TEST(kokkos_complex_float, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&       \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||           \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BCRS_TIMES_VEC_TEST(kokkos_complex_float, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&           \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||        \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BCRS_TIMES_VEC_TEST(kokkos_complex_float, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&       \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||        \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BCRS_TIMES_VEC_TEST(kokkos_complex_float, int64_t, size_t,
-                            TestExecSpace)
-#endif
-
-#undef EXECUTE_BCRS_TIMES_VEC_TEST
+#include <Test_Common_Test_All_Type_Combos.hpp>
+
+#undef KOKKOSKERNELS_EXECUTE_TEST
 
 //////////////////////////
 
-#define EXECUTE_BCRS_TIMES_MVEC_TEST(SCALAR, ORDINAL, OFFSET, DEVICE)                \
-  TEST_F(                                                                            \
-      TestCategory,                                                                  \
-      sparse##_##bcrs_times_multivec##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \
-    testBlockCrsMatrix_SpM_MV<SCALAR, ORDINAL, OFFSET, DEVICE>();                    \
+#define EXECUTE_BCRS_TIMES_MVEC_TEST(SCALAR, ORDINAL, OFFSET, LAYOUT, DEVICE)                   \
+  TEST_F(                                                                                       \
+      TestCategory,                                                                             \
+      sparse##_##bcrs_times_multivec##_##SCALAR##_##ORDINAL##_##OFFSET##_##LAYOUT##_##DEVICE) { \
+    testBlockCrsMatrix_SpM_MV<SCALAR, ORDINAL, OFFSET, Kokkos::LAYOUT,                          \
+                              DEVICE>();                                                        \
   }
 
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&      \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&        \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BCRS_TIMES_MVEC_TEST(double, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||     \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BCRS_TIMES_MVEC_TEST(double, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&         \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&    \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&           \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BCRS_TIMES_MVEC_TEST(double, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||  \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BCRS_TIMES_MVEC_TEST(double, int64_t, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&       \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&        \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BCRS_TIMES_MVEC_TEST(float, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&           \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||     \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BCRS_TIMES_MVEC_TEST(float, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&    \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&           \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BCRS_TIMES_MVEC_TEST(float, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&           \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||  \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BCRS_TIMES_MVEC_TEST(float, int64_t, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&            \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||            \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_double, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&        \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||            \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_double, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&            \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||         \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_double, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&        \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||         \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_double, int64_t, size_t,
-                             TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&           \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||           \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_float, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&       \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||           \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_float, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&           \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||        \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_float, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&       \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||        \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_float, int64_t, size_t,
-                             TestExecSpace)
-#endif
+#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT)
+
+#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \
+  EXECUTE_BCRS_TIMES_MVEC_TEST(SCALAR, ORDINAL, OFFSET, LayoutLeft, \
+                               TestExecSpace)
+
+#include <Test_Common_Test_All_Type_Combos.hpp>
+
+#undef KOKKOSKERNELS_EXECUTE_TEST
+
+#endif  // KOKKOSKERNELS_INST_LAYOUTLEFT
+
+#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT)
+
+#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE)  \
+  EXECUTE_BCRS_TIMES_MVEC_TEST(SCALAR, ORDINAL, OFFSET, LayoutRight, \
+                               TestExecSpace)
+
+#include <Test_Common_Test_All_Type_Combos.hpp>
+
+#undef KOKKOSKERNELS_EXECUTE_TEST
+
+#endif  // KOKKOSKERNELS_INST_LAYOUTRIGHT
 
 #undef EXECUTE_BCRS_TIMES_MVEC_TEST
diff --git a/unit_test/sparse/Test_Sparse_spmv_bsr.hpp b/unit_test/sparse/Test_Sparse_spmv_bsr.hpp
index e6d3b65ac5..344a203567 100644
--- a/unit_test/sparse/Test_Sparse_spmv_bsr.hpp
+++ b/unit_test/sparse/Test_Sparse_spmv_bsr.hpp
@@ -42,6 +42,7 @@
 //@HEADER
 */
 
+#include <algorithm>
 #include <gtest/gtest.h>
 #include <Kokkos_Core.hpp>
 #include <stdexcept>
@@ -96,33 +97,29 @@ inline void set_random_value(std::complex<Scalar> &v) {
 /// \param mat_rowmap[out]  CRS-style row map for the block matrix
 /// \param mat_colidx[out]  CRS-style column entries for the block matrix
 /// \param mat_val[out]  Numerical (random) values
-template <typename scalar_t, typename lno_t, typename size_type>
+template <typename scalar_t, typename lno_t, typename size_type,
+          typename rowmap_type, typename colidx_type, typename values_type>
 void make_block_entries(
     const KokkosSparse::CrsMatrix<scalar_t, lno_t, Kokkos::HostSpace, void,
                                   size_type> &mat_b1,
-    int blockSize, std::vector<lno_t> &mat_rowmap,
-    std::vector<lno_t> &mat_colidx, std::vector<scalar_t> &mat_val) {
-  lno_t nRow = blockSize * mat_b1.numRows();
+    int blockSize, rowmap_type &mat_rowmap, colidx_type &mat_colidx,
+    values_type &mat_val) {
   size_t nnz = static_cast<size_t>(blockSize) * static_cast<size_t>(blockSize) *
                mat_b1.nnz();
 
-  mat_val.resize(nnz);
   for (size_t ii = 0; ii < nnz; ++ii) set_random_value(mat_val[ii]);
 
   //
   // Create graph for CrsMatrix
   //
 
-  mat_rowmap.assign(nRow + 1, 0);
-  mat_colidx.assign(nnz, 0);
-
   for (lno_t ir = 0; ir < mat_b1.numRows(); ++ir) {
-    const auto jbeg = mat_b1.graph.row_map(ir);
-    const auto jend = mat_b1.graph.row_map(ir + 1);
+    const size_type jbeg = mat_b1.graph.row_map(ir);
+    const size_type jend = mat_b1.graph.row_map(ir + 1);
     for (lno_t ib = 0; ib < blockSize; ++ib) {
       const lno_t my_row     = ir * blockSize + ib;
       mat_rowmap[my_row + 1] = mat_rowmap[my_row] + (jend - jbeg) * blockSize;
-      for (auto ijk = jbeg; ijk < jend; ++ijk) {
+      for (size_type ijk = jbeg; ijk < jend; ++ijk) {
         const auto col0 = mat_b1.graph.entries(ijk);
         for (lno_t jb = 0; jb < blockSize; ++jb) {
           mat_colidx[mat_rowmap[my_row] + (ijk - jbeg) * blockSize + jb] =
@@ -177,17 +174,26 @@ void check_bsrm_times_v(const char fOp[], scalar_t alpha, scalar_t beta,
     size_type nnz = static_cast<size_type>(blockSize) *
                     static_cast<size_type>(blockSize) * mat_b1.nnz();
 
-    std::vector<lno_t> mat_rowmap(nRow + 1, 0);
-    std::vector<lno_t> mat_colidx(nnz, 0);
-    std::vector<scalar_t> mat_val(nnz);
+    Kokkos::View<size_type *, device> d_rowmap("crsmatrix", nRow + 1);
+    auto h_rowmap = Kokkos::create_mirror_view(d_rowmap);
+
+    Kokkos::View<lno_t *, device> d_colidx("crsmatrix", nnz);
+    auto h_colidx = Kokkos::create_mirror_view(d_colidx);
+
+    Kokkos::View<scalar_t *, device> d_matval("crsmatrix", nnz);
+    auto h_matval = Kokkos::create_mirror_view(d_matval);
 
     // Create the entries
-    make_block_entries<scalar_t, lno_t>(mat_b1, blockSize, mat_rowmap,
-                                        mat_colidx, mat_val);
+    make_block_entries<scalar_t, lno_t, size_type>(mat_b1, blockSize, h_rowmap,
+                                                   h_colidx, h_matval);
+
+    Kokkos::deep_copy(d_matval, h_matval);
+    Kokkos::deep_copy(d_colidx, h_colidx);
+    Kokkos::deep_copy(d_rowmap, h_rowmap);
 
     // Create the CrsMatrix for the reference computation
-    crsMat_t Acrs("new_crs_matr", nRow, nCol, nnz, &mat_val[0], &mat_rowmap[0],
-                  &mat_colidx[0]);
+    crsMat_t Acrs("new_crs_matr", nRow, nCol, nnz, d_matval, d_rowmap,
+                  d_colidx);
 
     x_vector_type xref("new_right_hand_side", nRow);
     auto h_xref = Kokkos::create_mirror_view(xref);
@@ -229,20 +235,21 @@ void check_bsrm_times_v(const char fOp[], scalar_t alpha, scalar_t beta,
     //
     // Compare the two products
     //
-    double error = 0.0, maxNorm = 0.0;
+    using KATS     = Kokkos::ArithTraits<scalar_t>;
+    using mag_type = typename KATS::mag_type;
+
+    const mag_type zero_mag = Kokkos::ArithTraits<mag_type>::zero();
+    mag_type error = zero_mag, maxNorm = zero_mag;
+
     Kokkos::deep_copy(h_ycrs, ycrs);
     Kokkos::deep_copy(h_ybsr, ybsr);
     for (lno_t ir = 0; ir < nRow; ++ir) {
-      error = std::max(
-          error, Kokkos::ArithTraits<scalar_t>::abs(h_ycrs(ir) - h_ybsr(ir)));
-      maxNorm =
-          std::max(maxNorm, Kokkos::ArithTraits<scalar_t>::abs(h_ycrs(ir)));
+      error   = std::max<mag_type>(error, KATS::abs(h_ycrs(ir) - h_ybsr(ir)));
+      maxNorm = std::max<mag_type>(maxNorm, KATS::abs(h_ycrs(ir)));
     }
 
-    double tmps =
-        static_cast<double>(Kokkos::ArithTraits<scalar_t>::abs(alpha)) +
-        static_cast<double>(Kokkos::ArithTraits<scalar_t>::abs(beta));
-    if ((tmps > 0.0) && (maxNorm == 0)) {
+    mag_type tmps = KATS::abs(alpha) + KATS::abs(beta);
+    if ((tmps > zero_mag) && (maxNorm == zero_mag)) {
       std::cout << " BSR - SpMV times MV >> blockSize " << blockSize
                 << " maxNorm " << maxNorm << " error " << error << " alpha "
                 << alpha << " beta " << beta << "\n";
@@ -252,9 +259,8 @@ void check_bsrm_times_v(const char fOp[], scalar_t alpha, scalar_t beta,
     //
     // --- Factor ((nnz / nRow) + 1) = Average number of non-zeros per row
     //
-    const auto tol = ((nnz / nRow) + 1) *
-                     static_cast<double>(Kokkos::ArithTraits<scalar_t>::abs(
-                         Kokkos::ArithTraits<scalar_t>::epsilon()));
+    const mag_type tol = ((static_cast<mag_type>(nnz) / nRow) + 1) *
+                         Kokkos::ArithTraits<mag_type>::epsilon();
     if (error > tol * maxNorm) {
       std::cout << " BSR - SpMV times V >> blockSize " << blockSize << " ratio "
                 << error / maxNorm << " tol " << tol << " maxNorm " << maxNorm
@@ -267,7 +273,7 @@ void check_bsrm_times_v(const char fOp[], scalar_t alpha, scalar_t beta,
 
 /// \brief Driver routine for checking BsrMatrix times multiple vector
 template <typename scalar_t, typename lno_t, typename size_type,
-          typename device>
+          typename layout, typename device>
 void check_bsrm_times_mv(const char fOp[], scalar_t alpha, scalar_t beta,
                          const lno_t bMax, int &num_errors) {
   // The mat_structure view is used to generate a matrix using
@@ -291,7 +297,7 @@ void check_bsrm_times_mv(const char fOp[], scalar_t alpha, scalar_t beta,
   typedef
       typename KokkosSparse::CrsMatrix<scalar_t, lno_t, device, void, size_type>
           crsMat_t;
-  typedef Kokkos::View<scalar_t **, Kokkos::LayoutLeft, device> block_vector_t;
+  typedef Kokkos::View<scalar_t **, layout, device> block_vector_t;
 
   h_crsMat_t mat_b1 =
       Test::generate_structured_matrix3D<h_crsMat_t>("FD", mat_structure);
@@ -309,17 +315,26 @@ void check_bsrm_times_mv(const char fOp[], scalar_t alpha, scalar_t beta,
     size_type nnz = static_cast<size_type>(blockSize) *
                     static_cast<size_type>(blockSize) * mat_b1.nnz();
 
-    std::vector<lno_t> mat_rowmap(nRow + 1, 0);
-    std::vector<lno_t> mat_colidx(nnz, 0);
-    std::vector<scalar_t> mat_val(nnz);
+    Kokkos::View<size_type *, device> d_rowmap("crsmatrix", nRow + 1);
+    auto h_rowmap = Kokkos::create_mirror_view(d_rowmap);
+
+    Kokkos::View<lno_t *, device> d_colidx("crsmatrix", nnz);
+    auto h_colidx = Kokkos::create_mirror_view(d_colidx);
+
+    Kokkos::View<scalar_t *, device> d_matval("crsmatrix", nnz);
+    auto h_matval = Kokkos::create_mirror_view(d_matval);
 
     // Create the entries
-    make_block_entries<scalar_t, lno_t>(mat_b1, static_cast<int>(blockSize),
-                                        mat_rowmap, mat_colidx, mat_val);
+    make_block_entries<scalar_t, lno_t, size_type>(mat_b1, blockSize, h_rowmap,
+                                                   h_colidx, h_matval);
+
+    Kokkos::deep_copy(d_matval, h_matval);
+    Kokkos::deep_copy(d_colidx, h_colidx);
+    Kokkos::deep_copy(d_rowmap, h_rowmap);
 
     // Create the CrsMatrix for the reference computation
-    crsMat_t Acrs("new_crs_matr", nRow, nCol, nnz, &mat_val[0], &mat_rowmap[0],
-                  &mat_colidx[0]);
+    crsMat_t Acrs("new_crs_matr", nRow, nCol, nnz, d_matval, d_rowmap,
+                  d_colidx);
 
     block_vector_t xref("new_right_hand_side", nRow, nrhs);
     auto h_xref = Kokkos::create_mirror_view(xref);
@@ -366,29 +381,29 @@ void check_bsrm_times_mv(const char fOp[], scalar_t alpha, scalar_t beta,
     //
     // Compare the two products
     //
-    double error = 0.0, maxNorm = 0.0;
+    using KATS     = Kokkos::ArithTraits<scalar_t>;
+    using mag_type = typename KATS::mag_type;
+
+    const mag_type zero_mag = Kokkos::ArithTraits<mag_type>::zero();
+    mag_type error = zero_mag, maxNorm = zero_mag;
     for (int jc = 0; jc < nrhs; ++jc) {
       for (int ir = 0; ir < nRow; ++ir) {
-        error   = std::max(error, Kokkos::ArithTraits<scalar_t>::abs(
-                                    h_ycrs(ir, jc) - h_ybsr(ir, jc)));
-        maxNorm = std::max(maxNorm,
-                           Kokkos::ArithTraits<scalar_t>::abs(h_ycrs(ir, jc)));
+        error   = std::max<mag_type>(error,
+                                   KATS::abs(h_ycrs(ir, jc) - h_ybsr(ir, jc)));
+        maxNorm = std::max<mag_type>(maxNorm, KATS::abs(h_ycrs(ir, jc)));
       }
     }
 
-    double tmps =
-        static_cast<double>(Kokkos::ArithTraits<scalar_t>::abs(alpha)) +
-        static_cast<double>(Kokkos::ArithTraits<scalar_t>::abs(beta));
-    if ((tmps > 0.0) && (maxNorm == 0)) {
+    mag_type tmps = KATS::abs(alpha) + KATS::abs(beta);
+    if ((tmps > zero_mag) && (maxNorm == zero_mag)) {
       std::cout << " BSR - SpMV times MV >> blockSize " << blockSize
                 << " maxNorm " << maxNorm << " error " << error << " alpha "
                 << alpha << " beta " << beta << "\n";
       num_errors += 1;
     }
 
-    auto tol = ((nnz / nRow) + 1) *
-               static_cast<double>(Kokkos::ArithTraits<scalar_t>::abs(
-                   Kokkos::ArithTraits<scalar_t>::epsilon()));
+    const mag_type tol = ((static_cast<mag_type>(nnz) / nRow) + 1) *
+                         Kokkos::ArithTraits<mag_type>::epsilon();
     if (error > tol * maxNorm) {
       std::cout << " BSR - SpMV times MV >> blockSize " << blockSize
                 << " ratio " << error / maxNorm << " tol " << tol << " maxNorm "
@@ -531,7 +546,7 @@ void testSpMVBsrMatrix() {
 }
 
 template <typename scalar_t, typename lno_t, typename size_type,
-          typename device>
+          typename layout, typename device>
 void testBsrMatrix_SpM_MV() {
   //
   // Test for the operation Y <- alpha * Op(A) * X + beta * Y
@@ -558,7 +573,7 @@ void testBsrMatrix_SpM_MV() {
       auto alpha_s = static_cast<scalar_t>(testAlphaBeta[ii]);
       auto beta_s  = static_cast<scalar_t>(testAlphaBeta[ii + 1]);
       num_errors   = 0;
-      Test_Bsr::check_bsrm_times_mv<scalar_t, lno_t, size_type, device>(
+      Test_Bsr::check_bsrm_times_mv<scalar_t, lno_t, size_type, layout, device>(
           &mode, alpha_s, beta_s, bMax, num_errors);
       if (num_errors > 0) {
         printf(
@@ -574,281 +589,48 @@ void testBsrMatrix_SpM_MV() {
 
 //////////////////////////
 
-#define EXECUTE_BSR_TIMES_VEC_TEST(SCALAR, ORDINAL, OFFSET, DEVICE)               \
+#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE)               \
   TEST_F(                                                                         \
       TestCategory,                                                               \
       sparse##_##bsrmat_times_vec##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \
     testSpMVBsrMatrix<SCALAR, ORDINAL, OFFSET, DEVICE>();                         \
   }
 
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&      \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&        \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BSR_TIMES_VEC_TEST(double, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||     \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BSR_TIMES_VEC_TEST(double, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&         \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&    \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&           \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BSR_TIMES_VEC_TEST(double, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||  \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BSR_TIMES_VEC_TEST(double, int64_t, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&       \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&        \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BSR_TIMES_VEC_TEST(float, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&           \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||     \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BSR_TIMES_VEC_TEST(float, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&    \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&           \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BSR_TIMES_VEC_TEST(float, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&           \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||  \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BSR_TIMES_VEC_TEST(float, int64_t, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&            \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||            \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BSR_TIMES_VEC_TEST(kokkos_complex_double, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&        \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||            \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BSR_TIMES_VEC_TEST(kokkos_complex_double, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&            \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||         \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BSR_TIMES_VEC_TEST(kokkos_complex_double, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&        \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||         \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BSR_TIMES_VEC_TEST(kokkos_complex_double, int64_t, size_t,
-                           TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&           \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||           \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BSR_TIMES_VEC_TEST(kokkos_complex_float, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&       \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||           \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BSR_TIMES_VEC_TEST(kokkos_complex_float, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&           \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||        \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BSR_TIMES_VEC_TEST(kokkos_complex_float, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&       \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||        \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BSR_TIMES_VEC_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace)
-#endif
-
-#undef EXECUTE_BSR_TIMES_VEC_TEST
+#include <Test_Common_Test_All_Type_Combos.hpp>
+
+#undef KOKKOSKERNELS_EXECUTE_TEST
 
 //////////////////////////
 
-#define EXECUTE_BSR_TIMES_MVEC_TEST(SCALAR, ORDINAL, OFFSET, DEVICE)                   \
-  TEST_F(                                                                              \
-      TestCategory,                                                                    \
-      sparse##_##bsrmat_times_multivec##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \
-    testBsrMatrix_SpM_MV<SCALAR, ORDINAL, OFFSET, DEVICE>();                           \
+#define EXECUTE_BSR_TIMES_MVEC_TEST(SCALAR, ORDINAL, OFFSET, LAYOUT, DEVICE)                      \
+  TEST_F(                                                                                         \
+      TestCategory,                                                                               \
+      sparse##_##bsrmat_times_multivec##_##SCALAR##_##ORDINAL##_##OFFSET##_##LAYOUT##_##DEVICE) { \
+    testBsrMatrix_SpM_MV<SCALAR, ORDINAL, OFFSET, Kokkos::LAYOUT, DEVICE>();                      \
   }
 
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&      \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&        \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BSR_TIMES_MVEC_TEST(double, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||     \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BSR_TIMES_MVEC_TEST(double, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&         \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&    \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&           \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BSR_TIMES_MVEC_TEST(double, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||  \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BSR_TIMES_MVEC_TEST(double, int64_t, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&       \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&        \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BSR_TIMES_MVEC_TEST(float, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&           \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||     \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BSR_TIMES_MVEC_TEST(float, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&    \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&           \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BSR_TIMES_MVEC_TEST(float, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&           \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||  \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BSR_TIMES_MVEC_TEST(float, int64_t, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&            \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||            \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_double, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&        \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||            \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_double, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&            \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||         \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_double, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&        \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||         \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_double, int64_t, size_t,
-                            TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&           \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||           \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_float, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&       \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||           \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_float, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&           \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||        \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_float, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&       \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||        \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_float, int64_t, size_t,
-                            TestExecSpace)
-#endif
+#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT)
+
+#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \
+  EXECUTE_BSR_TIMES_MVEC_TEST(SCALAR, ORDINAL, OFFSET, LayoutLeft,  \
+                              TestExecSpace)
+
+#include <Test_Common_Test_All_Type_Combos.hpp>
+
+#undef KOKKOSKERNELS_EXECUTE_TEST
+
+#endif  // KOKKOSKERNELS_INST_LAYOUTLEFT
+
+#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT)
+
+#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \
+  EXECUTE_BSR_TIMES_MVEC_TEST(SCALAR, ORDINAL, OFFSET, LayoutRight, \
+                              TestExecSpace)
+
+#include <Test_Common_Test_All_Type_Combos.hpp>
+
+#undef KOKKOSKERNELS_EXECUTE_TEST
+
+#endif  // KOKKOSKERNELS_INST_LAYOUTRIGHT
 
 #undef EXECUTE_BSR_TIMES_MVEC_TEST
diff --git a/unit_test/sparse/Test_Sparse_sptrsv.hpp b/unit_test/sparse/Test_Sparse_sptrsv.hpp
index 1be27d0c9c..c470747202 100644
--- a/unit_test/sparse/Test_Sparse_sptrsv.hpp
+++ b/unit_test/sparse/Test_Sparse_sptrsv.hpp
@@ -45,12 +45,11 @@
 #include <gtest/gtest.h>
 #include <Kokkos_Core.hpp>
 
-#include <Kokkos_Concepts.hpp>
 #include <string>
 #include <stdexcept>
 
 #include "KokkosKernels_IOUtils.hpp"
-#include "KokkosKernels_SparseUtils.hpp"
+#include "KokkosSparse_Utils.hpp"
 #include "KokkosSparse_spmv.hpp"
 #include "KokkosSparse_CrsMatrix.hpp"
 
@@ -122,7 +121,7 @@ void run_test_sptrsv_mtx() {
     bool is_lower_tri = true;
     std::cout << "Create handle" << std::endl;
     kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, is_lower_tri);
-    
+
     std::cout << "Prepare linear system" << std::endl;
     // Create known_lhs, generate rhs, then solve for lhs to compare to known_lhs
     ValuesType known_lhs("known_lhs", nrows);
@@ -239,7 +238,7 @@ void run_test_sptrsv_mtx() {
     bool is_lower_tri = false;
     std::cout << "Create handle" << std::endl;
     kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, is_lower_tri);
-    
+
     std::cout << "Prepare linear system" << std::endl;
     // Create known_lhs, generate rhs, then solve for lhs to compare to known_lhs
     ValuesType known_lhs("known_lhs", nrows);
@@ -1087,138 +1086,12 @@ void test_sptrsv() {
   //  Test::run_test_sptrsv_mtx<scalar_t, lno_t, size_type, device>();
 }
 
-#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE)                      \
+#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE)        \
   TEST_F(TestCategory,                                                     \
          sparse##_##sptrsv##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \
     test_sptrsv<SCALAR, ORDINAL, OFFSET, DEVICE>();                        \
   }
 
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&      \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&        \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||     \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&         \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&    \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&           \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||  \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(double, int64_t, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&       \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&        \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(float, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&           \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||     \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(float, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&    \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&           \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(float, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&           \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||  \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(float, int64_t, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&            \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||            \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_double, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&        \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||            \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&            \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||         \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_double, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&        \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||         \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&           \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||           \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_float, int, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&       \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||           \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&           \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||        \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&       \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||        \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace)
-#endif
+#include <Test_Common_Test_All_Type_Combos.hpp>
 
-#undef EXECUTE_TEST
+#undef KOKKOSKERNELS_EXECUTE_TEST
diff --git a/unit_test/sparse/Test_Sparse_trsv.hpp b/unit_test/sparse/Test_Sparse_trsv.hpp
index fce73897a8..9a23f48883 100644
--- a/unit_test/sparse/Test_Sparse_trsv.hpp
+++ b/unit_test/sparse/Test_Sparse_trsv.hpp
@@ -11,6 +11,7 @@
 #include <KokkosSparse_spmv.hpp>
 #include <KokkosKernels_TestUtils.hpp>
 #include <KokkosKernels_IOUtils.hpp>
+#include <KokkosSparse_IOUtils.hpp>
 
 #include <KokkosKernels_Utils.hpp>
 
@@ -76,8 +77,12 @@ void test_trsv_mv(lno_t numRows, size_type nnz, lno_t bandwidth,
   // this function creates a dense lower and upper triangular matrix.
   // TODO: SHOULD CHANGE IT TO SPARSE
   crsMat_t lower_part =
-      KokkosKernels::Impl::kk_generate_triangular_sparse_matrix<crsMat_t>(
+      KokkosSparse::Impl::kk_generate_triangular_sparse_matrix<crsMat_t>(
           'L', numRows, numCols, nnz, row_size_variance, bandwidth);
+
+  Test::shuffleMatrixEntries(lower_part.graph.row_map, lower_part.graph.entries,
+                             lower_part.values);
+
   KokkosSparse::spmv("N", alpha, lower_part, b_x_copy, beta, b_y);
   Test::check_trsv_mv(lower_part, b_x, b_y, b_x_copy, numMV, "L", "N");
 
@@ -86,8 +91,12 @@ void test_trsv_mv(lno_t numRows, size_type nnz, lno_t bandwidth,
   // typedef typename Kokkos::View<lno_t*, layout, Device> indexview;
 
   crsMat_t upper_part =
-      KokkosKernels::Impl::kk_generate_triangular_sparse_matrix<crsMat_t>(
+      KokkosSparse::Impl::kk_generate_triangular_sparse_matrix<crsMat_t>(
           'U', numRows, numCols, nnz, row_size_variance, bandwidth);
+
+  Test::shuffleMatrixEntries(upper_part.graph.row_map, upper_part.graph.entries,
+                             upper_part.values);
+
   KokkosSparse::spmv("N", alpha, upper_part, b_x_copy, beta, b_y);
   Test::check_trsv_mv(upper_part, b_x, b_y, b_x_copy, numMV, "U", "N");
 
@@ -95,309 +104,46 @@ void test_trsv_mv(lno_t numRows, size_type nnz, lno_t bandwidth,
   Test::check_trsv_mv(upper_part, b_x, b_y, b_x_copy, numMV, "U", "T");
 }
 
+// Note BMK 7-22: the matrix generator used by this test always
+// generates a dense triangle. It ignores bandwidth, nnz and row size variance.
+
 #define EXECUTE_TEST_MV(SCALAR, ORDINAL, OFFSET, LAYOUT, DEVICE)                    \
   TEST_F(                                                                           \
       TestCategory,                                                                 \
       sparse##_##trsv_mv##_##SCALAR##_##ORDINAL##_##OFFSET##_##LAYOUT##_##DEVICE) { \
     test_trsv_mv<SCALAR, ORDINAL, OFFSET, Kokkos::LAYOUT, DEVICE>(                  \
-        5000, 5000 * 30, 200, 10, 1);                                               \
+        1000, 1000 * 30, 200, 10, 1);                                               \
     test_trsv_mv<SCALAR, ORDINAL, OFFSET, Kokkos::LAYOUT, DEVICE>(                  \
-        5000, 5000 * 30, 100, 10, 5);                                               \
+        800, 800 * 30, 100, 10, 5);                                                 \
     test_trsv_mv<SCALAR, ORDINAL, OFFSET, Kokkos::LAYOUT, DEVICE>(                  \
-        1000, 1000 * 20, 100, 5, 10);                                               \
+        400, 400 * 20, 100, 5, 10);                                                 \
   }
 
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&      \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \
-     defined(KOKKOSKERNELS_INST_LAYOUTLEFT) &&  \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&        \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(double, int, int, LayoutLeft, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_LAYOUTLEFT) &&      \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||     \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(double, int64_t, int, LayoutLeft, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&         \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&    \
-     defined(KOKKOSKERNELS_INST_LAYOUTLEFT) &&     \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&           \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(double, int, size_t, LayoutLeft, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_LAYOUTLEFT) &&      \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||  \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(double, int64_t, size_t, LayoutLeft, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&       \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \
-     defined(KOKKOSKERNELS_INST_LAYOUTLEFT) &&  \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&        \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(float, int, int, LayoutLeft, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&           \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_LAYOUTLEFT) &&      \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||     \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(float, int64_t, int, LayoutLeft, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&    \
-     defined(KOKKOSKERNELS_INST_LAYOUTLEFT) &&     \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&           \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(float, int, size_t, LayoutLeft, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&           \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_LAYOUTLEFT) &&      \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||  \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(float, int64_t, size_t, LayoutLeft, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&            \
-     defined(KOKKOSKERNELS_INST_LAYOUTLEFT) &&             \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||            \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(kokkos_complex_double, int, int, LayoutLeft, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&        \
-     defined(KOKKOSKERNELS_INST_LAYOUTLEFT) &&             \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||            \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(kokkos_complex_double, int64_t, int, LayoutLeft, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&            \
-     defined(KOKKOSKERNELS_INST_LAYOUTLEFT) &&             \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||         \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(kokkos_complex_double, int, size_t, LayoutLeft, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&        \
-     defined(KOKKOSKERNELS_INST_LAYOUTLEFT) &&             \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||         \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(kokkos_complex_double, int64_t, size_t, LayoutLeft,
-                TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&           \
-     defined(KOKKOSKERNELS_INST_LAYOUTLEFT) &&            \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||           \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(kokkos_complex_float, int, int, LayoutLeft, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&       \
-     defined(KOKKOSKERNELS_INST_LAYOUTLEFT) &&            \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||           \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(kokkos_complex_float, int64_t, int, LayoutLeft, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&           \
-     defined(KOKKOSKERNELS_INST_LAYOUTLEFT) &&            \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||        \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(kokkos_complex_float, int, size_t, LayoutLeft, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&       \
-     defined(KOKKOSKERNELS_INST_LAYOUTLEFT) &&            \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||        \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(kokkos_complex_float, int64_t, size_t, LayoutLeft,
-                TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&      \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \
-     defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&        \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(double, int, int, LayoutRight, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) &&     \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||     \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
+#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \
+    (!defined(KOKKOSKERNELS_ETI_ONLY) &&      \
      !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(double, int64_t, int, LayoutRight, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&         \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&    \
-     defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) &&    \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&           \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(double, int, size_t, LayoutRight, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_DOUBLE) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) &&     \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||  \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(double, int64_t, size_t, LayoutRight, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&       \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \
-     defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&        \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(float, int, int, LayoutRight, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&           \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) &&     \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||     \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(float, int64_t, int, LayoutRight, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&          \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&    \
-     defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) &&    \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&           \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(float, int, size_t, LayoutRight, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_FLOAT) &&           \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \
-     defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) &&     \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||  \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&            \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(float, int64_t, size_t, LayoutRight, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&            \
-     defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) &&            \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||            \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(kokkos_complex_double, int, int, LayoutRight, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&        \
-     defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) &&            \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||            \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(kokkos_complex_double, int64_t, int, LayoutRight, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&            \
-     defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) &&            \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||         \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(kokkos_complex_double, int, size_t, LayoutRight, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&        \
-     defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) &&            \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||         \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                   \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(kokkos_complex_double, int64_t, size_t, LayoutRight,
-                TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&           \
-     defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) &&           \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||           \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(kokkos_complex_float, int, int, LayoutRight, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&       \
-     defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) &&           \
-     defined(KOKKOSKERNELS_INST_OFFSET_INT)) ||           \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(kokkos_complex_float, int64_t, int, LayoutRight, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT) &&           \
-     defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) &&           \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||        \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
-     !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(kokkos_complex_float, int, size_t, LayoutRight, TestExecSpace)
-#endif
-
-#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \
-     defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) &&       \
-     defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) &&           \
-     defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) ||        \
-    (!defined(KOKKOSKERNELS_ETI_ONLY) &&                  \
+
+#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \
+  EXECUTE_TEST_MV(SCALAR, ORDINAL, OFFSET, LayoutLeft, TestExecSpace)
+
+#include <Test_Common_Test_All_Type_Combos.hpp>
+
+#undef KOKKOSKERNELS_EXECUTE_TEST
+
+#endif  // KOKKOSKERNELS_INST_LAYOUTLEFT
+
+#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \
+    (!defined(KOKKOSKERNELS_ETI_ONLY) &&       \
      !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-EXECUTE_TEST_MV(kokkos_complex_float, int64_t, size_t, LayoutRight,
-                TestExecSpace)
-#endif
+
+#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \
+  EXECUTE_TEST_MV(SCALAR, ORDINAL, OFFSET, LayoutRight, TestExecSpace)
+
+#include <Test_Common_Test_All_Type_Combos.hpp>
+
+#undef KOKKOSKERNELS_EXECUTE_TEST
+
+#endif  // KOKKOSKERNELS_INST_LAYOUTRIGHT
 
 #undef EXECUTE_TEST_MV